/*
 * Copyright (C) 2008-2011 Red Hat, Inc.
 * Copyright (C) 2008 IBM Corp.
 *
 * lxc_container.c: file description
 *
 * Authors:
 *  David L. Leskovec <dlesko at linux.vnet.ibm.com>
 *  Daniel P. Berrange <berrange@redhat.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <config.h>

#include <fcntl.h>
#include <limits.h>
#include <stdlib.h>
31
#include <stdio.h>
32 33
#include <sys/ioctl.h>
#include <sys/mount.h>
34
#include <sys/wait.h>
35
#include <sys/stat.h>
36
#include <unistd.h>
37
#include <mntent.h>
38
#include <dirent.h>
39 40 41 42 43 44

/* Yes, we want linux private one, for _syscall2() macro */
#include <linux/unistd.h>

/* For MS_MOVE */
#include <linux/fs.h>
45

D
Daniel P. Berrange 已提交
46
#if HAVE_CAPNG
47
# include <cap-ng.h>
D
Daniel P. Berrange 已提交
48
#endif
49

50
#include "virterror_internal.h"
51
#include "logging.h"
52 53
#include "lxc_container.h"
#include "util.h"
54
#include "memory.h"
55
#include "veth.h"
56
#include "uuid.h"
E
Eric Blake 已提交
57
#include "virfile.h"
58
#include "command.h"
59

60 61
#define VIR_FROM_THIS VIR_FROM_LXC

62 63 64 65 66 67
/*
 * GLibc headers are behind the kernel, so we define these
 * constants if they're not present already.
 */

#ifndef CLONE_NEWPID
68
# define CLONE_NEWPID  0x20000000
69 70
#endif
#ifndef CLONE_NEWUTS
71
# define CLONE_NEWUTS  0x04000000
72 73
#endif
#ifndef CLONE_NEWUSER
74
# define CLONE_NEWUSER 0x10000000
75 76
#endif
#ifndef CLONE_NEWIPC
77
# define CLONE_NEWIPC  0x08000000
78 79
#endif
#ifndef CLONE_NEWNET
80
# define CLONE_NEWNET  0x40000000 /* New network namespace */
81 82 83 84 85 86 87 88
#endif

/* messages between parent and container */
typedef char lxc_message_t;
#define LXC_CONTINUE_MSG 'c'

typedef struct __lxc_child_argv lxc_child_argv_t;
struct __lxc_child_argv {
89
    virDomainDefPtr config;
90 91
    unsigned int nveths;
    char **veths;
92 93
    int monitor;
    char *ttyPath;
94
    int handshakefd;
95 96 97
};


98
/**
99
 * lxcContainerBuildInitCmd:
100
 * @vmDef: pointer to vm definition structure
101
 *
102
 * Build a virCommandPtr for launching the container 'init' process
103
 *
104
 * Returns a virCommandPtr
105
 */
106
static virCommandPtr lxcContainerBuildInitCmd(virDomainDefPtr vmDef)
107
{
108
    char uuidstr[VIR_UUID_STRING_BUFLEN];
109
    virCommandPtr cmd;
110 111 112

    virUUIDFormat(vmDef->uuid, uuidstr);

113 114 115 116 117 118 119
    cmd = virCommandNew(vmDef->os.init);

    virCommandAddEnvString(cmd, "PATH=/bin:/sbin");
    virCommandAddEnvString(cmd, "TERM=linux");
    virCommandAddEnvPair(cmd, "LIBVIRT_LXC_UUID", uuidstr);
    virCommandAddEnvPair(cmd, "LIBVIRT_LXC_NAME", vmDef->name);

120
    return cmd;
121 122 123
}

/**
 * lxcContainerSetStdio:
 * @control: control FD from parent
 * @ttyfd: FD of tty to set as the container console
 * @handshakefd: handshake FD, kept open across the FD sweep
 *
 * Starts a new session, makes the given tty its controlling terminal
 * and the primary console for the container, closes every other
 * inherited FD and finally duplicates the tty onto stdin, stdout
 * and stderr.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerSetStdio(int control, int ttyfd, int handshakefd)
{
    int rc = -1;
    int open_max, i;

    if (setsid() < 0) {
        virReportSystemError(errno, "%s",
                             _("setsid failed"));
        goto cleanup;
    }

    if (ioctl(ttyfd, TIOCSCTTY, NULL) < 0) {
        /* Name the ioctl that was actually issued (TIOCSCTTY) */
        virReportSystemError(errno, "%s",
                             _("ioctl(TIOCSCTTY) failed"));
        goto cleanup;
    }

    /* Just in case someone forgot to set FD_CLOEXEC, explicitly
     * close all FDs before executing the container.  Only the tty,
     * control and handshake FDs survive. */
    open_max = sysconf (_SC_OPEN_MAX);
    for (i = 0; i < open_max; i++) {
        if (i != ttyfd && i != control && i != handshakefd) {
            /* VIR_FORCE_CLOSE is handed a scratch copy rather than the
             * loop counter itself */
            int tmpfd = i;
            VIR_FORCE_CLOSE(tmpfd);
        }
    }

    if (dup2(ttyfd, 0) < 0) {
        virReportSystemError(errno, "%s",
                             _("dup2(stdin) failed"));
        goto cleanup;
    }

    if (dup2(ttyfd, 1) < 0) {
        virReportSystemError(errno, "%s",
                             _("dup2(stdout) failed"));
        goto cleanup;
    }

    if (dup2(ttyfd, 2) < 0) {
        virReportSystemError(errno, "%s",
                             _("dup2(stderr) failed"));
        goto cleanup;
    }

    rc = 0;

cleanup:
    VIR_DEBUG("rc=%d", rc);
    return rc;
}

/**
185
 * lxcContainerSendContinue:
186
 * @control: control FD to child
187
 *
188 189
 * Sends the continue message via the socket pair stored in the vm
 * structure.
190 191 192
 *
 * Returns 0 on success or -1 in case of error
 */
193
int lxcContainerSendContinue(int control)
194 195
{
    int rc = -1;
196 197
    lxc_message_t msg = LXC_CONTINUE_MSG;
    int writeCount = 0;
198

199 200 201
    writeCount = safewrite(control, &msg, sizeof(msg));
    if (writeCount != sizeof(msg)) {
        goto error_out;
202 203
    }

204 205 206
    rc = 0;
error_out:
    return rc;
207 208
}

209
/**
210
 * lxcContainerWaitForContinue:
211
 * @control: Control FD from parent
212 213 214 215 216 217 218
 *
 * This function will wait for the container continue message from the
 * parent process.  It will send this message on the socket pair stored in
 * the vm structure once it has completed the post clone container setup.
 *
 * Returns 0 on success or -1 in case of error
 */
219
int lxcContainerWaitForContinue(int control)
220 221 222 223
{
    lxc_message_t msg;
    int readLen;

224
    readLen = saferead(control, &msg, sizeof(msg));
225 226 227
    if (readLen != sizeof(msg) ||
        msg != LXC_CONTINUE_MSG) {
        return -1;
228 229
    }

230
    return 0;
231 232
}

233

234
/**
235
 * lxcContainerRenameAndEnableInterfaces:
236 237
 * @nveths: number of interfaces
 * @veths: interface names
238
 *
239 240 241
 * This function will rename the interfaces to ethN
 * with id ascending order from zero and enable the
 * renamed interfaces for this container.
242 243 244
 *
 * Returns 0 on success or nonzero in case of error
 */
245 246
static int lxcContainerRenameAndEnableInterfaces(unsigned int nveths,
                                                 char **veths)
247 248
{
    int rc = 0;
249
    unsigned int i;
250
    char *newname = NULL;
251

252
    for (i = 0 ; i < nveths ; i++) {
253 254 255
        if (virAsprintf(&newname, "eth%d", i) < 0) {
            virReportOOMError();
            rc = -1;
256
            goto error_out;
257
        }
258

259
        VIR_DEBUG("Renaming %s to %s", veths[i], newname);
260
        rc = setInterfaceName(veths[i], newname);
261
        if (rc < 0)
262 263
            goto error_out;

264
        VIR_DEBUG("Enabling %s", newname);
265
        rc = vethInterfaceUpOrDown(newname, 1);
266
        if (rc < 0)
267
            goto error_out;
268

269
        VIR_FREE(newname);
270 271 272
    }

    /* enable lo device only if there were other net devices */
273
    if (veths)
274 275 276
        rc = vethInterfaceUpOrDown("lo", 1);

error_out:
277
    VIR_FREE(newname);
278 279 280
    return rc;
}

281

282
/*_syscall2(int, pivot_root, char *, newroot, const char *, oldroot)*/
283 284 285 286 287 288 289
extern int pivot_root(const char * new_root,const char * put_old);

/*
 * qsort() comparator for an array of 'char *' mount paths.
 *
 * @a and @b each point at one array element (a pointer to a
 * NUL-terminated string).  The comparison is deliberately reversed
 * (descending strcmp order) so that deeper children such as
 * "/x/y/z" sort before their parent "/x/y" and get unmounted first.
 *
 * Returns <0, 0 or >0 following the strcmp() convention applied
 * to (*b, *a).
 */
static int lxcContainerChildMountSort(const void *a, const void *b)
{
    /* Keep the const qualification of the qsort arguments intact */
    const char *const *sa = a;
    const char *const *sb = b;

    /* Deliberately reversed args - we need to unmount deepest
       children first */
    return strcmp(*sb, *sa);
}

295
#ifndef MS_REC
296
# define MS_REC          16384
297 298 299
#endif

#ifndef MNT_DETACH
300
# define MNT_DETACH      0x00000002
301 302 303
#endif

#ifndef MS_PRIVATE
304
# define MS_PRIVATE              (1<<18)
305 306 307
#endif

#ifndef MS_SLAVE
308
# define MS_SLAVE                (1<<19)
309 310
#endif

311 312
/*
 * lxcContainerPivotRoot:
 * @root: filesystem definition whose ->src becomes the new '/'
 *
 * Switches the root filesystem of the calling process to root->src:
 *   1. makes '/' recursively private,
 *   2. mounts a tmpfs on <root->src>/.oldroot,
 *   3. bind-mounts root->src onto <root->src>/.oldroot/new
 *      (remounted read-only when root->readonly is set),
 *   4. changes into that directory and calls pivot_root(), leaving
 *      the previous root visible at /.oldroot,
 *   5. changes directory to the new '/'.
 *
 * Returns 0 on success, -1 on failure (an error has been reported)
 */
static int lxcContainerPivotRoot(virDomainFSDefPtr root)
{
    int ret;
    char *oldroot = NULL, *newroot = NULL;

    ret = -1;

    /* root->parent must be private, so make / private. */
    if (mount("", "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to make root private"));
        goto err;
    }

    if (virAsprintf(&oldroot, "%s/.oldroot", root->src) < 0) {
        virReportOOMError();
        goto err;
    }

    if (virFileMakePath(oldroot) < 0) {
        virReportSystemError(errno,
                             _("Failed to create %s"),
                             oldroot);
        goto err;
    }

    /* Create a tmpfs root since old and new roots must be
     * on separate filesystems */
    if (mount("tmprootfs", oldroot, "tmpfs", 0, NULL) < 0) {
        virReportSystemError(errno,
                             _("Failed to mount empty tmpfs at %s"),
                             oldroot);
        goto err;
    }

    /* Create a directory called 'new' in tmpfs */
    if (virAsprintf(&newroot, "%s/new", oldroot) < 0) {
        virReportOOMError();
        goto err;
    }

    if (virFileMakePath(newroot) < 0) {
        virReportSystemError(errno,
                             _("Failed to create %s"),
                             newroot);
        goto err;
    }

    /* ... and mount our root onto it */
    if (mount(root->src, newroot, NULL, MS_BIND|MS_REC, NULL) < 0) {
        virReportSystemError(errno,
                             _("Failed to bind new root %s into tmpfs"),
                             root->src);
        goto err;
    }

    /* A bind mount only becomes read-only via a second, remount pass */
    if (root->readonly) {
        if (mount(root->src, newroot, NULL, MS_BIND|MS_REC|MS_RDONLY|MS_REMOUNT, NULL) < 0) {
            virReportSystemError(errno,
                                 _("Failed to make new root %s readonly"),
                                 root->src);
            goto err;
        }
    }

    /* Now we change directory into the bind mount inside the tmpfs,
     * then pivot into the root->src bind-mounted onto '/new' */
    if (chdir(newroot) < 0) {
        virReportSystemError(errno,
                             _("Failed to chdir into %s"), newroot);
        goto err;
    }

    /* The old root directory will live at /.oldroot after
     * this and will soon be unmounted completely */
    if (pivot_root(".", ".oldroot") < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to pivot root"));
        goto err;
    }

    /* CWD is undefined after pivot_root, so go to / */
    if (chdir("/") < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to chdir into /"));
        goto err;
    }

    ret = 0;

err:
    VIR_FREE(oldroot);
    VIR_FREE(newroot);

    return ret;
}

405

406
/*
 * lxcContainerMountBasicFS:
 * @srcprefix: prefix prepended to the source of entries flagged
 *             'needPrefix', or NULL for no prefix
 * @pivotRoot: true when the container runs on a pivoted root; entries
 *             flagged 'onlyPivotRoot' are skipped otherwise
 *
 * Walks a fixed table of core mounts (/dev tmpfs, /proc, /proc/sys,
 * /sys, /selinux), creating each destination directory and mounting
 * it.  Entries whose source is an absolute path that is not readable
 * are silently skipped.
 *
 * Returns 0 on success, -1 on failure (an error has been reported)
 */
static int lxcContainerMountBasicFS(const char *srcprefix, bool pivotRoot)
{
    const struct {
        bool onlyPivotRoot;
        bool needPrefix;
        const char *src;
        const char *dst;
        const char *type;
        const char *opts;
        int mflags;
    } mnts[] = {
        /* When we want to make a bind mount readonly, for unknown reasons,
         * it is currently necessary to bind it once, and then remount the
         * bind with the readonly flag. If this is not done, then the original
         * mount point in the main OS becomes readonly too which is not what
         * we want. Hence some things have two entries here.
         */
        { true, false, "devfs", "/dev", "tmpfs", "mode=755", MS_NOSUID },
        { false, false, "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV },
        { false, false, "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND },
        { false, false, "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY },
        { false, true, "/sys", "/sys", NULL, NULL, MS_BIND },
        { false, true, "/sys", "/sys", NULL, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY },
        { false, true, "/selinux", "/selinux", NULL, NULL, MS_BIND },
        { false, true, "/selinux", "/selinux", NULL, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY },
    };
    size_t i;
    int rc = -1;

    VIR_DEBUG("Mounting basic filesystems %s pivotRoot=%d", NULLSTR(srcprefix), pivotRoot);

    for (i = 0 ; i < ARRAY_CARDINALITY(mnts) ; i++) {
        char *src = NULL;
        const char *srcpath = NULL;

        VIR_DEBUG("Consider %s onlyPivotRoot=%d",
                  mnts[i].src, mnts[i].onlyPivotRoot);
        if (mnts[i].onlyPivotRoot && !pivotRoot)
            continue;

        if (virFileMakePath(mnts[i].dst) < 0) {
            /* The directory being created is the destination */
            virReportSystemError(errno,
                                 _("Failed to mkdir %s"),
                                 mnts[i].dst);
            goto cleanup;
        }

        if (mnts[i].needPrefix && srcprefix) {
            if (virAsprintf(&src, "%s%s", srcprefix, mnts[i].src) < 0) {
                virReportOOMError();
                goto cleanup;
            }
            srcpath = src;
        } else {
            srcpath = mnts[i].src;
        }

        /* Skip if mount doesn't exist in source */
        if ((srcpath[0] == '/') &&
            (access(srcpath, R_OK) < 0)) {
            /* Release the prefixed path before moving on */
            VIR_FREE(src);
            continue;
        }

        /* type and opts are NULL for bind mounts, so guard the %s */
        VIR_DEBUG("Mount %s on %s type=%s flags=%x, opts=%s",
                  srcpath, mnts[i].dst, NULLSTR(mnts[i].type),
                  mnts[i].mflags, NULLSTR(mnts[i].opts));
        if (mount(srcpath, mnts[i].dst, mnts[i].type, mnts[i].mflags, mnts[i].opts) < 0) {
            /* Report while errno and srcpath are both still valid,
             * then release the prefixed path */
            virReportSystemError(errno,
                                 _("Failed to mount %s on %s type %s"),
                                 srcpath, mnts[i].dst, NULLSTR(mnts[i].type));
            VIR_FREE(src);
            goto cleanup;
        }
        VIR_FREE(src);
    }

    rc = 0;

cleanup:
    VIR_DEBUG("rc=%d", rc);
    return rc;
}


/*
 * lxcContainerMountDevFS:
 * @root: filesystem definition of the container root
 *
 * Creates /dev/pts and moves (MS_MOVE) the mount found at
 * /.oldroot<root->src>/dev/pts onto it.
 *
 * Returns 0 on success, -1 on failure (an error has been reported)
 */
static int lxcContainerMountDevFS(virDomainFSDefPtr root)
{
    char *oldpts = NULL;
    int ret = -1;

    if (virAsprintf(&oldpts, "/.oldroot%s/dev/pts", root->src) < 0) {
        virReportOOMError();
    } else if (virFileMakePath("/dev/pts") < 0) {
        virReportSystemError(errno, "%s",
                             _("Cannot create /dev/pts"));
    } else {
        VIR_DEBUG("Trying to move %s to %s", oldpts, "/dev/pts");
        if (mount(oldpts, "/dev/pts", NULL, MS_MOVE, NULL) < 0)
            virReportSystemError(errno, "%s",
                                 _("Failed to mount /dev/pts in container"));
        else
            ret = 0;
    }

    VIR_FREE(oldpts);

    return ret;
}

static int lxcContainerPopulateDevices(void)
{
    int i;
    const struct {
        int maj;
        int min;
        mode_t mode;
        const char *path;
    } devs[] = {
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL, 0666, "/dev/null" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO, 0666, "/dev/zero" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL, 0666, "/dev/full" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM, 0666, "/dev/random" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/dev/urandom" },
    };
533 534 535 536

    /* Populate /dev/ with a few important bits */
    for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) {
        dev_t dev = makedev(devs[i].maj, devs[i].min);
537
        if (mknod(devs[i].path, S_IFCHR, dev) < 0 ||
538
            chmod(devs[i].path, devs[i].mode)) {
539
            virReportSystemError(errno,
540
                                 _("Failed to make device %s"),
541
                                 devs[i].path);
542 543 544 545
            return -1;
        }
    }

S
Serge Hallyn 已提交
546 547 548 549 550 551 552 553
    dev_t dev = makedev(LXC_DEV_MAJ_TTY, LXC_DEV_MIN_PTMX);
    if (mknod("/dev/ptmx", S_IFCHR, dev) < 0 ||
        chmod("/dev/ptmx", 0666)) {
        virReportSystemError(errno, "%s",
                             _("Failed to make device /dev/ptmx"));
        return -1;
    }

554
    if (access("/dev/pts/ptmx", W_OK) == 0) {
S
Serge Hallyn 已提交
555
        if (mount("/dev/pts/ptmx", "/dev/ptmx", "ptmx", MS_BIND, NULL) < 0) {
556
            virReportSystemError(errno, "%s",
S
Serge Hallyn 已提交
557
                                 _("Failed to bind-mount /dev/ptmx to /dev/pts/ptmx"));
558 559 560 561
            return -1;
        }
    }

562 563 564 565 566 567 568 569 570
    /* XXX we should allow multiple consoles per container
     * for tty2, tty3, etc, but the domain XML does not
     * handle this yet
     */
    if (symlink("/dev/pts/0", "/dev/tty1") < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to symlink /dev/pts/0 to /dev/tty1"));
        return -1;
    }
571 572 573 574 575
    if (symlink("/dev/pts/0", "/dev/console") < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to symlink /dev/pts/0 to /dev/console"));
        return -1;
    }
576

577 578 579 580
    return 0;
}


581 582
/*
 * lxcContainerMountFSBind:
 * @fs: filesystem definition (->src, ->dst, ->readonly are used)
 * @srcprefix: string prepended to fs->src to locate the source
 *
 * Creates fs->dst and bind-mounts <srcprefix><fs->src> onto it.
 * When fs->readonly is set the bind is remounted read-only; failing
 * to do so is an error, since the mount would otherwise be left
 * writable.
 *
 * Returns 0 on success, -1 on failure (an error has been reported)
 */
static int lxcContainerMountFSBind(virDomainFSDefPtr fs,
                                   const char *srcprefix)
{
    char *src = NULL;
    int ret = -1;

    if (virAsprintf(&src, "%s%s", srcprefix, fs->src) < 0) {
        virReportOOMError();
        goto cleanup;
    }

    if (virFileMakePath(fs->dst) < 0) {
        virReportSystemError(errno,
                             _("Failed to create %s"),
                             fs->dst);
        goto cleanup;
    }

    if (mount(src, fs->dst, NULL, MS_BIND, NULL) < 0) {
        virReportSystemError(errno,
                             _("Failed to bind mount directory %s to %s"),
                             src, fs->dst);
        goto cleanup;
    }

    if (fs->readonly) {
        VIR_DEBUG("Binding %s readonly", fs->dst);
        if (mount(fs->dst, fs->dst, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
            virReportSystemError(errno,
                                 _("Failed to make directory %s readonly"),
                                 fs->dst);
            /* Do not report success for a mount that stayed writable */
            goto cleanup;
        }
    }

    ret = 0;

cleanup:
    VIR_FREE(src);
    return ret;
}



/*
 * This function attempts to do automatic detection of filesystem
 * type following the same rules as the util-linux 'mount' binary.
 *
 * The main difference is that we don't (currently) try to use
 * libblkid to detect the format first. We go straight to using
 * /etc/filesystems, and then /proc/filesystems
 *
 * @fs: filesystem definition; fs->dst is the mount point
 * @fsflags: flags passed to mount()
 * @src: path of the device to mount
 * @srcprefix: prefix under which /etc/filesystems and
 *             /proc/filesystems are looked up
 *
 * Each non-"nodev" line of the list is tried as a filesystem type
 * until one mount() succeeds.  /proc/filesystems is consulted when
 * /etc/filesystems is missing or ends with a '*' line.
 *
 * Returns 0 once a mount succeeded, -1 otherwise (an error has
 * been reported)
 */
static int lxcContainerMountFSBlockAuto(virDomainFSDefPtr fs,
                                        int fsflags,
                                        const char *src,
                                        const char *srcprefix)
{
    FILE *fp = NULL;
    int ret = -1;
    bool tryProc = false;
    bool gotStar = false;
    char *fslist = NULL;
    char *line = NULL;
    const char *type;

    VIR_DEBUG("src=%s srcprefix=%s dst=%s", src, srcprefix, fs->dst);

    /* First time around we use /etc/filesystems */
retry:
    if (virAsprintf(&fslist, "%s%s",
                    srcprefix, tryProc ? "/proc/filesystems" : "/etc/filesystems") < 0) {
        virReportOOMError();
        goto cleanup;
    }

    VIR_DEBUG("Open fslist %s", fslist);
    if (!(fp = fopen(fslist, "r"))) {
        /* If /etc/filesystems does not exist, then we need to retry
         * with /proc/filesystems next
         */
        if (errno == ENOENT &&
            !tryProc) {
            tryProc = true;
            VIR_FREE(fslist);
            goto retry;
        }

        virReportSystemError(errno,
                             _("Unable to read %s"),
                             fslist);
        goto cleanup;
    }

    while (!feof(fp)) {
        /* getline() allocates a fresh buffer when handed NULL/0 */
        size_t n = 0;
        VIR_FREE(line);
        if (getline(&line, &n, fp) <= 0) {
            if (feof(fp))
                break;

            /* A genuine read error must not fail silently */
            virReportSystemError(errno,
                                 _("Unable to read %s"),
                                 fslist);
            goto cleanup;
        }

        if (strstr(line, "nodev"))
            continue;

        type = strchr(line, '\n');
        if (type)
            line[type-line] = '\0';

        type = line;
        virSkipSpaces(&type);

        /*
         * /etc/filesystems is only allowed to contain '*' on the last line
         */
        if (gotStar) {
            lxcError(VIR_ERR_INTERNAL_ERROR,
                     _("%s has unexpected '*' before last line"),
                     fslist);
            goto cleanup;
        }

        /* An '*' on the last line in /etc/filesystems
         * means try /proc/filesystems next. We don't
         * jump immediately though, since we need to see
         * if any more lines follow.  '*' is a marker, not
         * a filesystem type, so it is never passed to mount().
         */
        if (!tryProc &&
            STREQ(type, "*")) {
            gotStar = true;
            continue;
        }

        VIR_DEBUG("Trying mount %s with %s", src, type);
        if (mount(src, fs->dst, type, fsflags, NULL) < 0) {
            /* These errnos indicate a bogus filesystem type for
             * the image we have, so skip to the next type
             */
            if (errno == EINVAL || errno == ENODEV)
                continue;

            virReportSystemError(errno,
                                 _("Failed to mount device %s to %s"),
                                 src, fs->dst);
            goto cleanup;
        }

        ret = 0;
        break;
    }

    /* We've got to the end of /etc/filesystems and saw
     * a '*', so we must try /proc/filesystems next
     */
    if (ret != 0 &&
        !tryProc &&
        gotStar) {
        tryProc = true;
        /* The marker belongs to /etc/filesystems only; leaving it set
         * would reject the first line of /proc/filesystems */
        gotStar = false;
        VIR_FREE(fslist);
        VIR_FORCE_FCLOSE(fp);
        goto retry;
    }

    /* Every candidate type was rejected: tell the caller why */
    if (ret != 0) {
        virReportSystemError(ENODEV,
                             _("Failed to mount device %s to %s, unable to detect filesystem"),
                             src, fs->dst);
    }

    VIR_DEBUG("Done mounting filesystem ret=%d tryProc=%d", ret, tryProc);

cleanup:
    VIR_FREE(line);
    VIR_FREE(fslist);
    VIR_FORCE_FCLOSE(fp);
    return ret;
}


/*
 * Creates the mount point fs->dst, then mounts block device 'src'
 * on it with automatic filesystem type probing.  MS_RDONLY is
 * requested when fs->readonly is set.
 *
 * Returns 0 on success, -1 on failure
 */
static int lxcContainerMountFSBlockHelper(virDomainFSDefPtr fs,
                                          const char *src,
                                          const char *srcprefix)
{
    int mountflags = 0;

    if (fs->readonly)
        mountflags |= MS_RDONLY;

    if (virFileMakePath(fs->dst) < 0) {
        virReportSystemError(errno,
                             _("Failed to create %s"),
                             fs->dst);
        return -1;
    }

    return lxcContainerMountFSBlockAuto(fs, mountflags, src, srcprefix);
}


/*
 * Mounts the block device <srcprefix><fs->src> on fs->dst by
 * delegating to lxcContainerMountFSBlockHelper().
 *
 * Returns 0 on success, -1 on failure
 */
static int lxcContainerMountFSBlock(virDomainFSDefPtr fs,
                                    const char *srcprefix)
{
    char *devpath = NULL;
    int rc = -1;

    if (virAsprintf(&devpath, "%s%s", srcprefix, fs->src) >= 0) {
        rc = lxcContainerMountFSBlockHelper(fs, devpath, srcprefix);

        VIR_DEBUG("Done mounting filesystem ret=%d", rc);
    } else {
        virReportOOMError();
    }

    VIR_FREE(devpath);
    return rc;
}


/*
 * lxcContainerMountFS:
 * @fs: filesystem definition to mount
 * @srcprefix: prefix prepended to the source path
 *
 * Dispatches on fs->type: VIR_DOMAIN_FS_TYPE_MOUNT is bind-mounted,
 * VIR_DOMAIN_FS_TYPE_BLOCK is mounted as a block device.  Any other
 * type is reported as an error and treated as a failure, so a
 * reported error never comes back as success.
 *
 * Returns 0 on success, -1 on failure (an error has been reported)
 */
static int lxcContainerMountFS(virDomainFSDefPtr fs,
                               const char *srcprefix)
{
    switch (fs->type) {
    case VIR_DOMAIN_FS_TYPE_MOUNT:
        if (lxcContainerMountFSBind(fs, srcprefix) < 0)
            return -1;
        break;
    case VIR_DOMAIN_FS_TYPE_BLOCK:
        if (lxcContainerMountFSBlock(fs, srcprefix) < 0)
            return -1;
        break;
    case VIR_DOMAIN_FS_TYPE_FILE:
        lxcError(VIR_ERR_INTERNAL_ERROR,
                 _("Unexpected filesystem type %s"),
                 virDomainFSTypeToString(fs->type));
        return -1;
    default:
        lxcError(VIR_ERR_CONFIG_UNSUPPORTED,
                 _("Cannot mount filesystem type %s"),
                 virDomainFSTypeToString(fs->type));
        return -1;
    }
    return 0;
}


/*
 * lxcContainerMountAllFS:
 * @vmDef: domain definition whose fss[] entries are mounted
 * @dstprefix: passed to lxcContainerMountFS() as its source prefix
 * @skipRoot: when true, the entry whose destination is "/" is skipped
 *
 * Mounts every filesystem of the guest config in array order,
 * stopping at the first failure.
 *
 * Returns 0 on success, -1 on failure
 */
static int lxcContainerMountAllFS(virDomainDefPtr vmDef,
                                  const char *dstprefix,
                                  bool skipRoot)
{
    size_t idx;

    VIR_DEBUG("Mounting %s %d", dstprefix, skipRoot);

    /* Pull in rest of container's mounts */
    for (idx = 0 ; idx < vmDef->nfss ; idx++) {
        virDomainFSDefPtr fs = vmDef->fss[idx];

        if (!(skipRoot && STREQ(fs->dst, "/"))) {
            if (lxcContainerMountFS(fs, dstprefix) < 0)
                return -1;
        }
    }

    VIR_DEBUG("Mounted all filesystems");
    return 0;
}


static int lxcContainerUnmountOldFS(void)
{
850
    struct mntent mntent;
851 852 853 854
    char **mounts = NULL;
    int nmounts = 0;
    FILE *procmnt;
    int i;
855
    char mntbuf[1024];
856 857

    if (!(procmnt = setmntent("/proc/mounts", "r"))) {
858
        virReportSystemError(errno, "%s",
859
                             _("Failed to read /proc/mounts"));
860 861
        return -1;
    }
862
    while (getmntent_r(procmnt, &mntent, mntbuf, sizeof(mntbuf)) != NULL) {
863
        VIR_DEBUG("Got %s", mntent.mnt_dir);
864
        if (!STRPREFIX(mntent.mnt_dir, "/.oldroot"))
865 866 867 868
            continue;

        if (VIR_REALLOC_N(mounts, nmounts+1) < 0) {
            endmntent(procmnt);
869
            virReportOOMError();
870 871
            return -1;
        }
872
        if (!(mounts[nmounts++] = strdup(mntent.mnt_dir))) {
873
            endmntent(procmnt);
874
            virReportOOMError();
875 876 877 878 879
            return -1;
        }
    }
    endmntent(procmnt);

880 881 882
    if (mounts)
        qsort(mounts, nmounts, sizeof(mounts[0]),
              lxcContainerChildMountSort);
883 884

    for (i = 0 ; i < nmounts ; i++) {
885
        VIR_DEBUG("Umount %s", mounts[i]);
886
        if (umount(mounts[i]) < 0) {
887
            virReportSystemError(errno,
888
                                 _("Failed to unmount '%s'"),
889
                                 mounts[i]);
890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906
            return -1;
        }
        VIR_FREE(mounts[i]);
    }
    VIR_FREE(mounts);

    return 0;
}


/* Got a FS mapped to /, we're going the pivot_root
 * approach to do a better-chroot-than-chroot
 * this is based on this thread http://lkml.org/lkml/2008/3/5/29
 *
 * The steps run strictly in this order and stop at the first failure:
 *   1. pivot to a private root, leaving all parent OS mounts
 *      on /.oldroot
 *   2. mount the core /proc, /sys, etc filesystems
 *   3. mount /dev and /dev/pts
 *   4. populate device nodes in /dev/
 *   5. set up any non-root mounts from guest config
 *   6. get rid of all remaining mounts from host OS, including
 *      /.oldroot itself
 *
 * Returns 0 on success, -1 on failure
 */
static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef,
                                      virDomainFSDefPtr root)
{
    if (lxcContainerPivotRoot(root) < 0 ||
        lxcContainerMountBasicFS("/.oldroot", true) < 0 ||
        lxcContainerMountDevFS(root) < 0 ||
        lxcContainerPopulateDevices() < 0 ||
        lxcContainerMountAllFS(vmDef, "/.oldroot", true) < 0 ||
        lxcContainerUnmountOldFS() < 0)
        return -1;

    return 0;
}

934

935 936 937 938
/* Nothing mapped to /, we're using the main root,
 * but with extra stuff mapped in.
 *
 * Returns 0 on success, -1 on failure
 */
static int lxcContainerSetupExtraMounts(virDomainDefPtr vmDef)
{
    VIR_DEBUG("def=%p", vmDef);

    /*
     * This makes sure that any new filesystems in the
     * host OS propagate to the container, but any
     * changes in the container are private
     */
    if (mount("", "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to make / slave"));
        goto fail;
    }

    VIR_DEBUG("Mounting config FS");
    if (lxcContainerMountAllFS(vmDef, "", false) < 0)
        goto fail;

    /* Mounts the core /proc, /sys, etc filesystems */
    if (lxcContainerMountBasicFS(NULL, false) < 0)
        goto fail;

    VIR_DEBUG("Mounting completed");
    return 0;

fail:
    return -1;
}

963 964
/*
 * Chooses the mount setup strategy: with no @root filesystem the
 * host root is reused with extra mounts, otherwise the container
 * pivots onto @root.
 *
 * Returns the result of the chosen setup function
 */
static int lxcContainerSetupMounts(virDomainDefPtr vmDef,
                                   virDomainFSDefPtr root)
{
    if (!root)
        return lxcContainerSetupExtraMounts(vmDef);

    return lxcContainerSetupPivotRoot(vmDef, root);
}

D
Daniel P. Berrange 已提交
972 973 974 975 976 977 978

/*
 * This is running as the 'init' process inside the container.
 * It removes some capabilities that could be dangerous to
 * host system, since they are not currently "containerized"
 *
 * Without libcap-ng support only a warning is logged.
 *
 * Returns 0 on success (or when libcap-ng is not compiled in),
 * -1 if the capabilities could not be updated or applied
 */
static int lxcContainerDropCapabilities(void)
{
#if HAVE_CAPNG
    int rv;

    capng_get_caps_process();

    rv = capng_updatev(CAPNG_DROP,
                       CAPNG_EFFECTIVE | CAPNG_PERMITTED |
                       CAPNG_INHERITABLE | CAPNG_BOUNDING_SET,
                       CAP_SYS_BOOT, /* No use of reboot */
                       CAP_SYS_MODULE, /* No kernel module loading */
                       CAP_SYS_TIME, /* No changing the clock */
                       CAP_AUDIT_CONTROL, /* No messing with auditing status */
                       CAP_MAC_ADMIN, /* No messing with LSM config */
                       -1 /* sentinel */);
    if (rv < 0) {
        lxcError(VIR_ERR_INTERNAL_ERROR,
                 _("Failed to remove capabilities: %d"), rv);
        return -1;
    }

    rv = capng_apply(CAPNG_SELECT_BOTH);
    if (rv < 0) {
        lxcError(VIR_ERR_INTERNAL_ERROR,
                 _("Failed to apply capabilities: %d"), rv);
        return -1;
    }

    /* We do not need to call capng_lock() in this case. The bounding
     * set restriction will prevent them reacquiring sys_boot/module/time,
     * etc which is all that matters for the container. Once inside the
     * container it is fine for SECURE_NOROOT / SECURE_NO_SETUID_FIXUP to
     * be unmasked  - they can never escape the bounding set. */

#else
    VIR_WARN("libcap-ng support not compiled in, unable to clear capabilities");
#endif
    return 0;
}


1018
/**
1019 1020
 * lxcContainerChild:
 * @data: pointer to container arguments
1021 1022 1023 1024 1025 1026 1027 1028 1029
 *
 * This function is run in the process clone()'d in lxcStartContainer.
 * Perform a number of container setup tasks:
 *     Setup container file system
 *     mount container /proca
 * Then exec's the container init
 *
 * Returns 0 on success or -1 in case of error
 */
1030
static int lxcContainerChild( void *data )
1031
{
1032
    lxc_child_argv_t *argv = data;
1033
    virDomainDefPtr vmDef = argv->config;
1034
    int ttyfd = -1;
1035
    int ret = -1;
1036
    char *ttyPath = NULL;
1037
    virDomainFSDefPtr root;
1038
    virCommandPtr cmd = NULL;
1039 1040

    if (NULL == vmDef) {
1041
        lxcError(VIR_ERR_INTERNAL_ERROR,
J
Jim Meyering 已提交
1042
                 "%s", _("lxcChild() passed invalid vm definition"));
1043
        goto cleanup;
1044 1045
    }

1046 1047 1048
    cmd = lxcContainerBuildInitCmd(vmDef);
    virCommandWriteArgLog(cmd, 1);

1049
    root = virDomainGetRootFilesystem(vmDef);
1050

1051 1052
    if (root) {
        if (virAsprintf(&ttyPath, "%s%s", root->src, argv->ttyPath) < 0) {
1053
            virReportOOMError();
1054
            goto cleanup;
1055 1056 1057
        }
    } else {
        if (!(ttyPath = strdup(argv->ttyPath))) {
1058
            virReportOOMError();
1059
            goto cleanup;
1060 1061
        }
    }
1062
    VIR_DEBUG("Container TTY path: %s", ttyPath);
1063 1064

    ttyfd = open(ttyPath, O_RDWR|O_NOCTTY);
1065
    if (ttyfd < 0) {
1066
        virReportSystemError(errno,
1067
                             _("Failed to open tty %s"),
1068
                             ttyPath);
1069
        goto cleanup;
1070
    }
1071

1072
    if (lxcContainerSetupMounts(vmDef, root) < 0)
1073
        goto cleanup;
1074

1075 1076 1077 1078 1079 1080 1081
    if (!virFileExists(vmDef->os.init)) {
        virReportSystemError(errno,
                    _("cannot find init path '%s' relative to container root"),
                    vmDef->os.init);
        goto cleanup;
    }

1082
    /* Wait for interface devices to show up */
1083 1084 1085
    if (lxcContainerWaitForContinue(argv->monitor) < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to read the container continue message"));
1086
        goto cleanup;
1087 1088
    }
    VIR_DEBUG("Received container continue message");
1089

1090 1091
    /* rename and enable interfaces */
    if (lxcContainerRenameAndEnableInterfaces(argv->nveths,
1092
                                              argv->veths) < 0) {
1093
        goto cleanup;
1094
    }
1095

1096
    /* drop a set of root capabilities */
D
Daniel P. Berrange 已提交
1097
    if (lxcContainerDropCapabilities() < 0)
1098
        goto cleanup;
1099

1100 1101 1102 1103 1104 1105 1106
    if (lxcContainerSendContinue(argv->handshakefd) < 0) {
        virReportSystemError(errno, "%s",
                            _("failed to send continue signal to controller"));
        goto cleanup;
    }

    if (lxcContainerSetStdio(argv->monitor, ttyfd, argv->handshakefd) < 0) {
1107 1108
        goto cleanup;
    }
1109

1110
    ret = 0;
1111
cleanup:
1112 1113
    VIR_FREE(ttyPath);
    VIR_FORCE_CLOSE(ttyfd);
1114
    VIR_FORCE_CLOSE(argv->monitor);
1115
    VIR_FORCE_CLOSE(argv->handshakefd);
1116 1117 1118 1119 1120 1121

    if (ret == 0) {
        /* this function will only return if an error occured */
        ret = virCommandExec(cmd);
    }

1122 1123
    virCommandFree(cmd);
    return ret;
1124
}
1125

1126 1127
/*
 * Reports whether user namespaces should be enabled for containers.
 *
 * Currently hard-wired to 0 (disabled); the real probe via
 * lxcContainerAvailable() is compiled out.
 *
 * Returns non-zero if user namespaces may be used, 0 otherwise.
 */
static int userns_supported(void)
{
#if 1
    /*
     * put off using userns until uid mapping is implemented
     */
    return 0;
#else
    return lxcContainerAvailable(LXC_CONTAINER_FEATURE_USER) == 0;
#endif
}

1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158
/*
 * Look up the 32bit counterpart of a 64bit Linux architecture name.
 *
 * @arch: architecture name to look up
 *
 * Returns the name of the matching 32bit architecture, or NULL if
 * @arch is not one of the 64bit architectures listed in the table.
 */
const char *lxcContainerGetAlt32bitArch(const char *arch)
{
    /* Any Linux 64bit arch which has a 32bit
     * personality available should be listed here */
    static const struct {
        const char *arch64;
        const char *arch32;
    } altArches[] = {
        { "x86_64",   "i686"   },
        { "s390x",    "s390"   },
        { "ppc64",    "ppc"    },
        { "parisc64", "parisc" },
        { "sparc64",  "sparc"  },
        { "mips64",   "mips"   },
    };
    size_t i;

    for (i = 0; i < sizeof(altArches) / sizeof(altArches[0]); i++) {
        if (STREQ(arch, altArches[i].arch64))
            return altArches[i].arch32;
    }

    return NULL;
}


1159 1160
/**
 * lxcContainerStart:
1161 1162 1163 1164 1165
 * @def: pointer to virtual machine structure
 * @nveths: number of interfaces
 * @veths: interface names
 * @control: control FD to the container
 * @ttyPath: path of tty to set as the container console
1166 1167 1168 1169 1170
 *
 * Starts a container process by calling clone() with the namespace flags
 *
 * Returns PID of container on success or -1 in case of error
 */
1171
int lxcContainerStart(virDomainDefPtr def,
1172 1173
                      unsigned int nveths,
                      char **veths,
1174
                      int control,
1175
                      int handshakefd,
1176 1177 1178
                      char *ttyPath)
{
    pid_t pid;
E
Eric Blake 已提交
1179
    int cflags;
1180 1181
    int stacksize = getpagesize() * 4;
    char *stack, *stacktop;
1182 1183
    lxc_child_argv_t args = { def, nveths, veths, control, ttyPath,
                              handshakefd};
1184 1185 1186

    /* allocate a stack for the container */
    if (VIR_ALLOC_N(stack, stacksize) < 0) {
1187
        virReportOOMError();
1188 1189 1190 1191
        return -1;
    }
    stacktop = stack + stacksize;

E
Eric Blake 已提交
1192
    cflags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWIPC|SIGCHLD;
1193

1194
    if (userns_supported()) {
1195
        VIR_DEBUG("Enable user namespaces");
E
Eric Blake 已提交
1196
        cflags |= CLONE_NEWUSER;
1197
    }
1198

1199
    if (def->nets != NULL) {
1200
        VIR_DEBUG("Enable network namespaces");
E
Eric Blake 已提交
1201
        cflags |= CLONE_NEWNET;
1202
    }
1203

E
Eric Blake 已提交
1204
    pid = clone(lxcContainerChild, stacktop, cflags, &args);
1205
    VIR_FREE(stack);
1206
    VIR_DEBUG("clone() completed, new container PID is %d", pid);
1207 1208

    if (pid < 0) {
1209
        virReportSystemError(errno, "%s",
1210
                             _("Failed to run clone container"));
1211 1212 1213 1214 1215 1216
        return -1;
    }

    return pid;
}

1217 1218
/*
 * Trivial clone() entry point used by lxcContainerAvailable() to
 * probe for namespace support: the child exits immediately with
 * status 0 and never returns.
 */
ATTRIBUTE_NORETURN static int
lxcContainerDummyChild(void *argv ATTRIBUTE_UNUSED)
{
    _exit(0);
}

int lxcContainerAvailable(int features)
{
1225
    int flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|
1226 1227 1228 1229 1230 1231
        CLONE_NEWIPC|SIGCHLD;
    int cpid;
    char *childStack;
    char *stack;
    int childStatus;

1232 1233 1234
    if (features & LXC_CONTAINER_FEATURE_USER)
        flags |= CLONE_NEWUSER;

1235 1236 1237 1238
    if (features & LXC_CONTAINER_FEATURE_NET)
        flags |= CLONE_NEWNET;

    if (VIR_ALLOC_N(stack, getpagesize() * 4) < 0) {
1239
        VIR_DEBUG("Unable to allocate stack");
1240 1241 1242 1243 1244 1245 1246 1247
        return -1;
    }

    childStack = stack + (getpagesize() * 4);

    cpid = clone(lxcContainerDummyChild, childStack, flags, NULL);
    VIR_FREE(stack);
    if (cpid < 0) {
1248
        char ebuf[1024] ATTRIBUTE_UNUSED;
1249
        VIR_DEBUG("clone call returned %s, container support is not enabled",
1250
              virStrerror(errno, ebuf, sizeof ebuf));
1251 1252 1253 1254 1255
        return -1;
    } else {
        waitpid(cpid, &childStatus, 0);
    }

1256
    VIR_DEBUG("Mounted all filesystems");
1257
    return 0;
1258
}