lxc_container.c 17.6 KB
Newer Older
1 2
/*
 * Copyright IBM Corp. 2008
 * Copyright Red Hat 2008
 *
 * lxc_container.c: LXC container process setup and startup
 *
 * Authors:
 *  David L. Leskovec <dlesko at linux.vnet.ibm.com>
 *  Daniel P. Berrange <berrange@redhat.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <config.h>

#include <fcntl.h>
#include <limits.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <unistd.h>
#include <mntent.h>

/* Yes, we want linux private one, for _syscall2() macro */
#include <linux/unistd.h>

/* For MS_MOVE */
#include <linux/fs.h>

#include "lxc_container.h"
#include "util.h"
#include "memory.h"
#include "veth.h"
48 49 50 51

/* Debug logging helpers that pass this source file's name to VIR_DEBUG.
 * DEBUG takes a printf-style format plus at least one argument;
 * DEBUG0 is for a plain string with no format arguments. */
#define DEBUG(fmt,...) VIR_DEBUG(__FILE__, fmt, __VA_ARGS__)
#define DEBUG0(msg) VIR_DEBUG(__FILE__, "%s", msg)

52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
/*
 * GLibc headers are behind the kernel, so we define these
 * constants if they're not present already.  They are OR'ed into
 * the flags passed to clone() when the container process is created.
 */

#ifndef CLONE_NEWPID
#define CLONE_NEWPID  0x20000000 /* New PID namespace */
#endif
#ifndef CLONE_NEWUTS
#define CLONE_NEWUTS  0x04000000 /* New UTS (hostname) namespace */
#endif
#ifndef CLONE_NEWUSER
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
#endif
#ifndef CLONE_NEWIPC
#define CLONE_NEWIPC  0x08000000 /* New IPC namespace */
#endif
#ifndef CLONE_NEWNET
#define CLONE_NEWNET  0x40000000 /* New network namespace */
#endif

/* messages between parent and container: a single character, written by
 * lxcContainerSendContinue() and read by lxcContainerWaitForContinue() */
typedef char lxc_message_t;
#define LXC_CONTINUE_MSG 'c'

typedef struct __lxc_child_argv lxc_child_argv_t;
struct __lxc_child_argv {
79
    virDomainDefPtr config;
80 81
    unsigned int nveths;
    char **veths;
82 83 84 85 86
    int monitor;
    char *ttyPath;
};


87
/**
88
 * lxcContainerExecInit:
89 90
 * @vmDef: Ptr to vm definition structure
 *
91
 * Exec the container init string. The container init will replace then
92 93
 * be running in the current process
 *
94
 * Does not return
95
 */
96
static int lxcContainerExecInit(virDomainDefPtr vmDef)
97
{
98
    const char *const argv[] = {
99
        vmDef->os.init,
100 101
        NULL,
    };
102

103
    return execve(argv[0], (char **)argv, NULL);
104 105 106
}

/**
107 108 109
 * lxcContainerSetStdio:
 * @control: the conrol FD
 * @ttyPath: Name of tty to set as the container console
110 111 112 113 114 115
 *
 * Sets the given tty as the primary conosole for the container as well as
 * stdout, stdin and stderr.
 *
 * Returns 0 on success or -1 in case of error
 */
116
static int lxcContainerSetStdio(int control, int ttyfd)
117 118
{
    int rc = -1;
119
    int open_max, i;
120 121 122 123

    if (setsid() < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("setsid failed: %s"), strerror(errno));
124
        goto cleanup;
125 126 127 128 129 130 131 132
    }

    if (ioctl(ttyfd, TIOCSCTTY, NULL) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("ioctl(TIOCSTTY) failed: %s"), strerror(errno));
        goto cleanup;
    }

133 134 135 136
    /* Just in case someone forget to set FD_CLOEXEC, explicitly
     * close all FDs before executing the container */
    open_max = sysconf (_SC_OPEN_MAX);
    for (i = 0; i < open_max; i++)
137
        if (i != ttyfd && i != control)
138
            close(i);
139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164

    if (dup2(ttyfd, 0) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("dup2(stdin) failed: %s"), strerror(errno));
        goto cleanup;
    }

    if (dup2(ttyfd, 1) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("dup2(stdout) failed: %s"), strerror(errno));
        goto cleanup;
    }

    if (dup2(ttyfd, 2) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("dup2(stderr) failed: %s"), strerror(errno));
        goto cleanup;
    }

    rc = 0;

cleanup:
    return rc;
}

/**
165 166
 * lxcContainerSendContinue:
 * @monitor: control FD to child
167
 *
168 169
 * Sends the continue message via the socket pair stored in the vm
 * structure.
170 171 172
 *
 * Returns 0 on success or -1 in case of error
 */
173
int lxcContainerSendContinue(int control)
174 175
{
    int rc = -1;
176 177
    lxc_message_t msg = LXC_CONTINUE_MSG;
    int writeCount = 0;
178

179 180
    writeCount = safewrite(control, &msg, sizeof(msg));
    if (writeCount != sizeof(msg)) {
181
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
182 183 184
                 _("unable to send container continue message: %s"),
                 strerror(errno));
        goto error_out;
185 186
    }

187
    rc = 0;
188

189 190
error_out:
    return rc;
191 192
}

193
/**
194 195
 * lxcContainerWaitForContinue:
 * @control: control FD from parent
196 197 198 199 200 201 202
 *
 * This function will wait for the container continue message from the
 * parent process.  It will send this message on the socket pair stored in
 * the vm structure once it has completed the post clone container setup.
 *
 * Returns 0 on success or -1 in case of error
 */
203
static int lxcContainerWaitForContinue(int control)
204 205 206 207
{
    lxc_message_t msg;
    int readLen;

208
    readLen = saferead(control, &msg, sizeof(msg));
209 210
    if (readLen != sizeof(msg) ||
        msg != LXC_CONTINUE_MSG) {
211 212 213
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("Failed to read the container continue message: %s"),
                 strerror(errno));
214
        return -1;
215
    }
216
    close(control);
217 218 219

    DEBUG0("Received container continue message");

220
    return 0;
221 222
}

223

224 225 226 227 228 229 230 231
/**
 * lxcContainerEnableInterfaces:
 * @nveths: number of entries in @veths
 * @veths: names of the interfaces to bring up
 *
 * Brings up each listed interface in order, stopping at the first
 * failure.  If all succeed and @veths is non-NULL, the "lo" device
 * is brought up as well.
 *
 * Returns 0 on success or nonzero in case of error
 */
static int lxcContainerEnableInterfaces(unsigned int nveths,
                                        char **veths)
{
    unsigned int idx = 0;

    while (idx < nveths) {
        int status;

        DEBUG("Enabling %s", veths[idx]);
        status = vethInterfaceUpOrDown(veths[idx], 1);
        if (status != 0)
            return status;
        idx++;
    }

    /* enable lo device only if there were other net devices
     * (the test is on the array pointer, not on nveths) */
    return veths ? vethInterfaceUpOrDown("lo", 1) : 0;
}

254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322

/* Hand-written prototype for pivot_root(), used by lxcContainerPivotRoot().
 * The commented-out line is a _syscall2()-based alternative left for
 * reference.
 * NOTE(review): presumably none of the included headers declares
 * pivot_root() - confirm before removing this declaration. */
//_syscall2(int, pivot_root, char *, newroot, const char *, oldroot)
extern int pivot_root(const char * new_root,const char * put_old);

/*
 * qsort() comparator for an array of C strings (each element is a
 * pointer to a NUL-terminated path).  Produces descending strcmp()
 * order, so a longer path sorts ahead of any path it extends.
 */
static int lxcContainerChildMountSort(const void *a, const void *b)
{
    const char *const *lhs = a;
    const char *const *rhs = b;

    /* Operands are swapped on purpose - we need to unmount the
       deepest children first */
    return strcmp(*rhs, *lhs);
}

/*
 * Switches the root filesystem of the calling process to root->src.
 *
 * root->src is first bind-mounted onto itself so that it is a mount
 * point, then "<root->src>/.oldroot" is created and pivot_root() is
 * called with it as the place to put the previous root.  Finally the
 * working directory is reset to "/".
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerPivotRoot(virDomainFSDefPtr root)
{
    char *oldroot = NULL;
    int rc = -1;

    /* First step is to ensure the new root itself is
       a mount point */
    if (mount(root->src, root->src, NULL, MS_BIND, NULL) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to bind new root %s: %s"),
                 root->src, strerror(errno));
        return -1;
    }

    if (asprintf(&oldroot, "%s/.oldroot", root->src) < 0) {
        /* contents of oldroot are undefined after a failed asprintf */
        oldroot = NULL;
        lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, NULL);
        return -1;
    }

    /* Errors are reported before oldroot is released: the messages
     * print the path, and freeing first could also disturb errno */
    if (virFileMakePath(oldroot) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to create %s: %s"),
                 oldroot, strerror(errno));
        goto cleanup;
    }

    /* The old root directory will live at /.oldroot after
     * this and will soon be unmounted completely */
    if (pivot_root(root->src, oldroot) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to pivot root %s to %s: %s"),
                 oldroot, root->src, strerror(errno));
        goto cleanup;
    }

    /* CWD is undefined after pivot_root, so go to / */
    if (chdir("/") < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to chdir to new root: %s"),
                 strerror(errno));
        goto cleanup;
    }

    rc = 0;

cleanup:
    VIR_FREE(oldroot);
    return rc;
}

static int lxcContainerPopulateDevices(void)
{
    int i;
    const struct {
        int maj;
        int min;
        mode_t mode;
        const char *path;
    } devs[] = {
D
Dan Smith 已提交
323 324 325 326 327 328
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL, 0666, "/dev/null" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO, 0666, "/dev/zero" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL, 0666, "/dev/full" },
        { LXC_DEV_MAJ_TTY, LXC_DEV_MIN_CONSOLE, 0600, "/dev/console" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM, 0666, "/dev/random" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/dev/urandom" },
329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532
    };

    if (virFileMakePath("/dev") < 0 ||
        mount("none", "/dev", "tmpfs", 0, NULL) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to mount /dev tmpfs for container: %s"),
                 strerror(errno));
        return -1;
    }
    /* Move old devpts into container, since we have to
       connect to the master ptmx which was opened in
       the parent.
       XXX This sucks, we need to figure out how to get our
       own private devpts for isolation
    */
    if (virFileMakePath("/dev/pts") < 0 ||
        mount("/.oldroot/dev/pts", "/dev/pts", NULL,
              MS_MOVE, NULL) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to move /dev/pts into container: %s"),
                 strerror(errno));
        return -1;
    }

    /* Populate /dev/ with a few important bits */
    for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) {
        dev_t dev = makedev(devs[i].maj, devs[i].min);
        if (mknod(devs[i].path, 0, dev) < 0 ||
            chmod(devs[i].path, devs[i].mode)) {
            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                     _("failed to make device %s: %s"),
                     devs[i].path, strerror(errno));
            return -1;
        }
    }

    return 0;
}


/*
 * Bind-mounts the container's additional filesystems after the root
 * has been pivoted.  For every entry in vmDef->fss whose destination
 * is not "/" and whose type is VIR_DOMAIN_FS_TYPE_MOUNT, the source
 * is looked up under /.oldroot and bind-mounted at the destination,
 * which is created first if necessary.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerMountNewFS(virDomainDefPtr vmDef)
{
    virDomainFSDefPtr tmp;

    /* Pull in rest of container's mounts */
    for (tmp = vmDef->fss; tmp; tmp = tmp->next) {
        char *src;
        if (STREQ(tmp->dst, "/"))
            continue;
        /* XXX fix to support other mount types */
        if (tmp->type != VIR_DOMAIN_FS_TYPE_MOUNT)
            continue;

        if (asprintf(&src, "/.oldroot/%s", tmp->src) < 0) {
            lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, NULL);
            return -1;
        }

        if (virFileMakePath(tmp->dst) < 0 ||
            mount(src, tmp->dst, NULL, MS_BIND, NULL) < 0) {
            /* Report before freeing so that releasing src cannot
             * disturb errno ahead of strerror() */
            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                     _("failed to mount %s at %s for container: %s"),
                     tmp->src, tmp->dst, strerror(errno));
            VIR_FREE(src);
            return -1;
        }
        VIR_FREE(src);
    }

    return 0;
}


/*
 * Unmounts everything still mounted under the previous root.
 *
 * Collects from /proc/mounts every mount directory that starts with
 * "/.oldroot", sorts the paths so that deeper ones come first, and
 * unmounts them in that order.  The collected list is released on
 * every exit path.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerUnmountOldFS(void)
{
    struct mntent *mntent;
    char **mounts = NULL;
    int nmounts = 0;
    FILE *procmnt;
    int rc = -1;
    int i;

    if (!(procmnt = setmntent("/proc/mounts", "r"))) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to read /proc/mounts: %s"),
                 strerror(errno));
        return -1;
    }
    while ((mntent = getmntent(procmnt)) != NULL) {
        if (!STRPREFIX(mntent->mnt_dir, "/.oldroot"))
            continue;

        if (VIR_REALLOC_N(mounts, nmounts+1) < 0) {
            endmntent(procmnt);
            lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, NULL);
            goto cleanup;
        }
        if (!(mounts[nmounts] = strdup(mntent->mnt_dir))) {
            endmntent(procmnt);
            lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, NULL);
            goto cleanup;
        }
        /* only count the slot once it holds a valid string */
        nmounts++;
    }
    endmntent(procmnt);

    /* Skip the sort when nothing matched: mounts is still NULL then */
    if (nmounts > 0)
        qsort(mounts, nmounts, sizeof(mounts[0]),
              lxcContainerChildMountSort);

    for (i = 0 ; i < nmounts ; i++) {
        if (umount(mounts[i]) < 0) {
            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                     _("failed to unmount %s: %s"),
                     mounts[i], strerror(errno));
            goto cleanup;
        }
    }

    rc = 0;

cleanup:
    if (mounts) {
        for (i = 0 ; i < nmounts ; i++)
            VIR_FREE(mounts[i]);
    }
    VIR_FREE(mounts);

    return rc;
}


/* A filesystem is mapped to /, so use pivot_root to get a
 * better-chroot-than-chroot; the approach is based on this thread:
 * http://lkml.org/lkml/2008/3/5/29
 *
 * Steps, in order: pivot into the new root, mount /proc, populate
 * /dev, bind-mount the remaining container filesystems, and unmount
 * what is left of the old root.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef,
                                      virDomainFSDefPtr root)
{
    int procMounted;

    if (lxcContainerPivotRoot(root) < 0)
        return -1;

    procMounted = virFileMakePath("/proc") >= 0 &&
                  mount("none", "/proc", "proc", 0, NULL) >= 0;
    if (!procMounted) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to mount /proc for container: %s"),
                 strerror(errno));
        return -1;
    }

    /* Each step runs only if the previous one succeeded */
    if (lxcContainerPopulateDevices() < 0 ||
        lxcContainerMountNewFS(vmDef) < 0 ||
        lxcContainerUnmountOldFS() < 0)
        return -1;

    return 0;
}

/* No filesystem is mapped to /, so the container keeps the main
 * root.  Every VIR_DOMAIN_FS_TYPE_MOUNT entry in vmDef->fss is
 * bind-mounted at its destination, then a new proc instance is
 * mounted on /proc.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerSetupExtraMounts(virDomainDefPtr vmDef)
{
    virDomainFSDefPtr fs;

    for (fs = vmDef->fss; fs != NULL; fs = fs->next) {
        /* XXX fix to support other mount types; for now entries of
         * any other type are silently skipped */
        if (fs->type == VIR_DOMAIN_FS_TYPE_MOUNT &&
            mount(fs->src, fs->dst, NULL, MS_BIND, NULL) < 0) {
            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                     _("failed to mount %s at %s for container: %s"),
                     fs->src, fs->dst, strerror(errno));
            return -1;
        }
    }

    /* mount /proc */
    if (mount("lxcproc", "/proc", "proc", 0, NULL) >= 0)
        return 0;

    lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
             _("failed to mount /proc for container: %s"),
             strerror(errno));
    return -1;
}

/*
 * Chooses how the container's filesystems are set up.  If vmDef->fss
 * contains a VIR_DOMAIN_FS_TYPE_MOUNT entry whose destination is "/",
 * the first such entry becomes the new root via pivot_root; otherwise
 * the extra mounts are layered onto the existing root.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerSetupMounts(virDomainDefPtr vmDef)
{
    virDomainFSDefPtr fs;

    for (fs = vmDef->fss; fs != NULL; fs = fs->next) {
        if (fs->type == VIR_DOMAIN_FS_TYPE_MOUNT && STREQ(fs->dst, "/"))
            return lxcContainerSetupPivotRoot(vmDef, fs);
    }

    return lxcContainerSetupExtraMounts(vmDef);
}

533 534 535 536 537 538 539 540 541 542 543 544
/**
 * lxcChild:
 * @argv: Pointer to container arguments
 *
 * This function is run in the process clone()'d in lxcStartContainer.
 * Perform a number of container setup tasks:
 *     Setup container file system
 *     mount container /proca
 * Then exec's the container init
 *
 * Returns 0 on success or -1 in case of error
 */
545
static int lxcContainerChild( void *data )
546
{
547
    lxc_child_argv_t *argv = data;
548
    virDomainDefPtr vmDef = argv->config;
549
    int ttyfd;
550 551 552 553

    if (NULL == vmDef) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("lxcChild() passed invalid vm definition"));
554
        return -1;
555 556
    }

557 558
    if (lxcContainerSetupMounts(vmDef) < 0)
        return -1;
559

560 561
    ttyfd = open(argv->ttyPath, O_RDWR|O_NOCTTY);
    if (ttyfd < 0) {
562
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
563
                 _("open(%s) failed: %s"), argv->ttyPath, strerror(errno));
564
        return -1;
565 566
    }

567 568
    if (lxcContainerSetStdio(argv->monitor, ttyfd) < 0) {
        close(ttyfd);
569
        return -1;
570 571
    }
    close(ttyfd);
572

573
    /* Wait for interface devices to show up */
574 575
    if (lxcContainerWaitForContinue(argv->monitor) < 0)
        return -1;
576 577

    /* enable interfaces */
578
    if (lxcContainerEnableInterfaces(argv->nveths, argv->veths) < 0)
579
        return -1;
580

581
    /* this function will only return if an error occured */
582 583
    return lxcContainerExecInit(vmDef);
}
584

585 586 587 588 589 590 591 592 593
/**
 * lxcContainerStart:
 * @driver: pointer to driver structure
 * @vm: pointer to virtual machine structure
 *
 * Starts a container process by calling clone() with the namespace flags
 *
 * Returns PID of container on success or -1 in case of error
 */
594
int lxcContainerStart(virDomainDefPtr def,
595 596
                      unsigned int nveths,
                      char **veths,
597 598 599 600 601 602 603
                      int control,
                      char *ttyPath)
{
    pid_t pid;
    int flags;
    int stacksize = getpagesize() * 4;
    char *stack, *stacktop;
604
    lxc_child_argv_t args = { def, nveths, veths, control, ttyPath };
605 606 607

    /* allocate a stack for the container */
    if (VIR_ALLOC_N(stack, stacksize) < 0) {
608
        lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, NULL);
609 610 611 612 613 614 615 616 617 618 619 620 621 622
        return -1;
    }
    stacktop = stack + stacksize;

    flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWUSER|CLONE_NEWIPC|SIGCHLD;

    if (def->nets != NULL)
        flags |= CLONE_NEWNET;

    pid = clone(lxcContainerChild, stacktop, flags, &args);
    VIR_FREE(stack);
    DEBUG("clone() returned, %d", pid);

    if (pid < 0) {
623
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665
                 _("clone() failed, %s"), strerror(errno));
        return -1;
    }

    return pid;
}

/* clone() entry point used by lxcContainerAvailable() to probe for
 * namespace support: ignores its argument and exits at once with
 * status 0. */
static int lxcContainerDummyChild(void *argv ATTRIBUTE_UNUSED)
{
    _exit(EXIT_SUCCESS);
}

/**
 * lxcContainerAvailable:
 * @features: bitmask of LXC_CONTAINER_FEATURE_* values to probe for
 *
 * Checks whether containers can be created by clone()'ing a child
 * that exits immediately, using the same namespace flags as a real
 * container.  A network namespace is requested only when
 * LXC_CONTAINER_FEATURE_NET is set in @features.
 *
 * Returns 0 if the clone() succeeded or -1 otherwise
 */
int lxcContainerAvailable(int features)
{
    int flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWUSER|
        CLONE_NEWIPC|SIGCHLD;
    int cpid;
    int cloneErrno;
    int stacksize = getpagesize() * 4;
    char *childStack;
    char *stack;
    int childStatus;

    if (features & LXC_CONTAINER_FEATURE_NET)
        flags |= CLONE_NEWNET;

    if (VIR_ALLOC_N(stack, stacksize) < 0) {
        DEBUG0("Unable to allocate stack");
        return -1;
    }

    /* clone() is handed the highest address of the child's stack */
    childStack = stack + stacksize;

    cpid = clone(lxcContainerDummyChild, childStack, flags, NULL);
    /* Save the failure reason before VIR_FREE can overwrite errno */
    cloneErrno = errno;
    VIR_FREE(stack);
    if (cpid < 0) {
        DEBUG("clone call returned %s, container support is not enabled",
              strerror(cloneErrno));
        return -1;
    }

    /* Reap the probe child; retry if a signal interrupts the wait so
     * that it is not left behind unreaped */
    while (waitpid(cpid, &childStatus, 0) < 0 && errno == EINTR)
        ;

    return 0;
}