lxc_container.c 10.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
/*
 * Copyright IBM Corp. 2008
 *
 * lxc_container.c: LXC container process creation and in-container setup
 *
 * Authors:
 *  David L. Leskovec <dlesko at linux.vnet.ibm.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <config.h>

#ifdef WITH_LXC

#include <fcntl.h>
#include <limits.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
33
#include <sys/wait.h>
34 35 36 37
#include <unistd.h>

#include "lxc_container.h"
#include "util.h"
38
#include "memory.h"
39
#include "veth.h"
40 41 42 43

/*
 * Debug logging shorthands. Both forward to VIR_DEBUG with __FILE__ as
 * the first argument:
 *   DEBUG(fmt, ...) - format string plus at least one argument (the
 *                     expansion always emits a comma before __VA_ARGS__)
 *   DEBUG0(msg)     - a plain string, passed through a "%s" format
 */
#define DEBUG(fmt,...) VIR_DEBUG(__FILE__, fmt, __VA_ARGS__)
#define DEBUG0(msg) VIR_DEBUG(__FILE__, "%s", msg)

44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
/*
 * GLibc headers are behind the kernel, so we define these
 * constants if they're not present already.
 *
 * Each value is a flag bit OR'ed into the flags argument of clone()
 * by lxcContainerStart() and lxcContainerAvailable() below. The
 * #ifndef guards keep any definition supplied by the system headers.
 */

#ifndef CLONE_NEWPID
#define CLONE_NEWPID  0x20000000
#endif
#ifndef CLONE_NEWUTS
#define CLONE_NEWUTS  0x04000000
#endif
#ifndef CLONE_NEWUSER
#define CLONE_NEWUSER 0x10000000
#endif
#ifndef CLONE_NEWIPC
#define CLONE_NEWIPC  0x08000000
#endif
#ifndef CLONE_NEWNET
#define CLONE_NEWNET  0x40000000 /* New network namespace */
#endif

/* messages between parent and container */
/* A message is a single byte; LXC_CONTINUE_MSG is the only one defined here */
typedef char lxc_message_t;
#define LXC_CONTINUE_MSG 'c'

/*
 * Argument bundle handed to lxcContainerChild() through the void *
 * argument of clone().
 */
typedef struct __lxc_child_argv lxc_child_argv_t;
struct __lxc_child_argv {
    lxc_vm_def_t *config;   /* vm definition of the container to start */
    int monitor;            /* control FD on which the continue message is read */
    char *ttyPath;          /* path of the tty used as the container console */
};


77
/**
78
 * lxcContainerExecInit:
79 80
 * @vmDef: Ptr to vm definition structure
 *
81
 * Exec the container init string. The container init will replace then
82 83
 * be running in the current process
 *
84
 * Does not return
85
 */
86
static int lxcContainerExecInit(const lxc_vm_def_t *vmDef)
87
{
88 89 90 91
    const char *const argv[] = {
        vmDef->init,
        NULL,
    };
92

93
    return execve(argv[0], (char **)argv, NULL);
94 95 96
}

/**
97 98 99
 * lxcContainerSetStdio:
 * @control: the conrol FD
 * @ttyPath: Name of tty to set as the container console
100 101 102 103 104 105
 *
 * Sets the given tty as the primary conosole for the container as well as
 * stdout, stdin and stderr.
 *
 * Returns 0 on success or -1 in case of error
 */
106
static int lxcContainerSetStdio(int control, const char *ttyPath)
107 108 109
{
    int rc = -1;
    int ttyfd;
110
    int open_max, i;
111 112 113 114 115 116 117

    if (setsid() < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("setsid failed: %s"), strerror(errno));
        goto error_out;
    }

118
    ttyfd = open(ttyPath, O_RDWR|O_NOCTTY);
119 120
    if (ttyfd < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
121
                 _("open(%s) failed: %s"), ttyPath, strerror(errno));
122 123 124 125 126 127 128 129 130
        goto error_out;
    }

    if (ioctl(ttyfd, TIOCSCTTY, NULL) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("ioctl(TIOCSTTY) failed: %s"), strerror(errno));
        goto cleanup;
    }

131 132 133 134
    /* Just in case someone forget to set FD_CLOEXEC, explicitly
     * close all FDs before executing the container */
    open_max = sysconf (_SC_OPEN_MAX);
    for (i = 0; i < open_max; i++)
135
        if (i != ttyfd && i != control)
136
            close(i);
137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165

    if (dup2(ttyfd, 0) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("dup2(stdin) failed: %s"), strerror(errno));
        goto cleanup;
    }

    if (dup2(ttyfd, 1) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("dup2(stdout) failed: %s"), strerror(errno));
        goto cleanup;
    }

    if (dup2(ttyfd, 2) < 0) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("dup2(stderr) failed: %s"), strerror(errno));
        goto cleanup;
    }

    rc = 0;

cleanup:
    close(ttyfd);

error_out:
    return rc;
}

/**
166 167
 * lxcContainerSendContinue:
 * @monitor: control FD to child
168
 *
169 170
 * Sends the continue message via the socket pair stored in the vm
 * structure.
171 172 173
 *
 * Returns 0 on success or -1 in case of error
 */
174 175
int lxcContainerSendContinue(virConnectPtr conn,
                             int control)
176 177
{
    int rc = -1;
178 179
    lxc_message_t msg = LXC_CONTINUE_MSG;
    int writeCount = 0;
180

181 182 183 184 185 186
    writeCount = safewrite(control, &msg, sizeof(msg));
    if (writeCount != sizeof(msg)) {
        lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("unable to send container continue message: %s"),
                 strerror(errno));
        goto error_out;
187 188
    }

189
    rc = 0;
190

191 192
error_out:
    return rc;
193 194
}

195
/**
196 197
 * lxcContainerWaitForContinue:
 * @control: control FD from parent
198 199 200 201 202 203 204
 *
 * This function will wait for the container continue message from the
 * parent process.  It will send this message on the socket pair stored in
 * the vm structure once it has completed the post clone container setup.
 *
 * Returns 0 on success or -1 in case of error
 */
205
static int lxcContainerWaitForContinue(int control)
206 207 208 209
{
    lxc_message_t msg;
    int readLen;

210
    readLen = saferead(control, &msg, sizeof(msg));
211 212
    if (readLen != sizeof(msg) ||
        msg != LXC_CONTINUE_MSG) {
213 214 215
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("Failed to read the container continue message: %s"),
                 strerror(errno));
216
        return -1;
217
    }
218
    close(control);
219 220 221

    DEBUG0("Received container continue message");

222
    return 0;
223 224 225 226 227 228 229 230 231 232
}

/**
 * lxcEnableInterfaces:
 * @vm: Pointer to vm structure
 *
 * This function will enable the interfaces for this container.
 *
 * Returns 0 on success or nonzero in case of error
 */
233
static int lxcContainerEnableInterfaces(const lxc_vm_def_t *def)
234 235 236 237
{
    int rc = 0;
    const lxc_net_def_t *net;

238
    for (net = def->nets; net; net = net->next) {
239 240 241 242 243 244 245 246
        DEBUG("Enabling %s", net->containerVeth);
        rc =  vethInterfaceUpOrDown(net->containerVeth, 1);
        if (0 != rc) {
            goto error_out;
        }
    }

    /* enable lo device only if there were other net devices */
247
    if (def->nets)
248 249 250 251 252 253
        rc = vethInterfaceUpOrDown("lo", 1);

error_out:
    return rc;
}

254 255 256 257 258 259 260 261 262 263 264 265
/**
 * lxcChild:
 * @argv: Pointer to container arguments
 *
 * This function is run in the process clone()'d in lxcStartContainer.
 * Perform a number of container setup tasks:
 *     Setup container file system
 *     mount container /proca
 * Then exec's the container init
 *
 * Returns 0 on success or -1 in case of error
 */
266
static int lxcContainerChild( void *data )
267 268
{
    int rc = -1;
269 270
    lxc_child_argv_t *argv = data;
    lxc_vm_def_t *vmDef = argv->config;
271 272 273 274 275 276
    lxc_mount_t *curMount;
    int i;

    if (NULL == vmDef) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("lxcChild() passed invalid vm definition"));
277
        return -1;
278 279 280 281 282 283 284 285 286 287 288 289 290 291 292
    }

    /* handle the bind mounts first before doing anything else that may */
    /* then access those mounted dirs */
    curMount = vmDef->mounts;
    for (i = 0; curMount; curMount = curMount->next) {
        rc = mount(curMount->source,
                   curMount->target,
                   NULL,
                   MS_BIND,
                   NULL);
        if (0 != rc) {
            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                     _("failed to mount %s at %s for container: %s"),
                     curMount->source, curMount->target, strerror(errno));
293
            return -1;
294 295 296 297 298 299 300 301 302
        }
    }

    /* mount /proc */
    rc = mount("lxcproc", "/proc", "proc", 0, NULL);
    if (0 != rc) {
        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("failed to mount /proc for container: %s"),
                 strerror(errno));
303
        return -1;
304 305
    }

306 307 308
    if (lxcContainerSetStdio(argv->monitor, argv->ttyPath) < 0)
        return -1;

309
    /* Wait for interface devices to show up */
310 311
    if (lxcContainerWaitForContinue(argv->monitor) < 0)
        return -1;
312 313

    /* enable interfaces */
314 315
    if (lxcContainerEnableInterfaces(vmDef) < 0)
        return -1;
316

317
    /* this function will only return if an error occured */
318 319
    return lxcContainerExecInit(vmDef);
}
320

321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402
/**
 * lxcContainerStart:
 * @conn: pointer to connection, used for error reporting
 * @def: pointer to the vm definition of the container
 * @control: control FD the container reads the continue message from
 * @ttyPath: path of the tty to use as the container console
 *
 * Starts a container process by calling clone() with the namespace flags.
 * A new network namespace is requested only when the definition has at
 * least one network interface.
 *
 * Returns PID of container on success or -1 in case of error
 */
int lxcContainerStart(virConnectPtr conn,
                      lxc_vm_def_t *def,
                      int control,
                      char *ttyPath)
{
    pid_t pid;
    int flags;
    int cloneErrno;
    int stacksize = getpagesize() * 4;
    char *stack, *stacktop;
    lxc_child_argv_t args = { def, control, ttyPath };

    /* allocate a stack for the container */
    if (VIR_ALLOC_N(stack, stacksize) < 0) {
        lxcError(conn, NULL, VIR_ERR_NO_MEMORY,
                 _("unable to allocate container stack"));
        return -1;
    }
    /* clone() is handed the top of the stack area */
    stacktop = stack + stacksize;

    flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWUSER|CLONE_NEWIPC|SIGCHLD;

    if (def->nets != NULL)
        flags |= CLONE_NEWNET;

    pid = clone(lxcContainerChild, stacktop, flags, &args);
    /* Capture errno right away: the free and the debug output below
     * may overwrite it before it is reported. */
    cloneErrno = errno;
    /* CLONE_VM is not among the flags, so the parent's copy of the
     * stack can be released here. */
    VIR_FREE(stack);
    DEBUG("clone() returned, %d", pid);

    if (pid < 0) {
        lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR,
                 _("clone() failed, %s"), strerror(cloneErrno));
        return -1;
    }

    return pid;
}

/*
 * Entry point of the throw-away process cloned by lxcContainerAvailable():
 * ignores its argument and terminates immediately with a success status.
 */
static int lxcContainerDummyChild(void *argv ATTRIBUTE_UNUSED)
{
    _exit(EXIT_SUCCESS);
}

/**
 * lxcContainerAvailable:
 * @features: bitmask of optional features to probe; when
 *            LXC_CONTAINER_FEATURE_NET is set, a new network namespace
 *            is requested as well
 *
 * Probes for container support by clone()ing a child that exits
 * immediately, using the same namespace flags as lxcContainerStart,
 * and then reaping it.
 *
 * Returns 0 if the clone() succeeded, -1 if the stack could not be
 * allocated or clone() failed
 */
int lxcContainerAvailable(int features)
{
    int flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWUSER|
        CLONE_NEWIPC|SIGCHLD;
    int cpid;
    int cloneErrno;
    int stacksize = getpagesize() * 4;
    char *childStack;
    char *stack;
    int childStatus;

    if (features & LXC_CONTAINER_FEATURE_NET)
        flags |= CLONE_NEWNET;

    if (VIR_ALLOC_N(stack, stacksize) < 0) {
        DEBUG0("Unable to allocate stack");
        return -1;
    }

    /* clone() is handed the top of the stack area */
    childStack = stack + stacksize;

    cpid = clone(lxcContainerDummyChild, childStack, flags, NULL);
    /* Capture errno before VIR_FREE gets a chance to overwrite it */
    cloneErrno = errno;
    VIR_FREE(stack);
    if (cpid < 0) {
        DEBUG("clone call returned %s, container support is not enabled",
              strerror(cloneErrno));
        return -1;
    }

    /* Reap the probe child; retry if interrupted by a signal so that it
     * is not left behind as a zombie. Its exit status is not used. */
    while (waitpid(cpid, &childStatus, 0) < 0 && errno == EINTR)
        ;

    return 0;
}

#endif /* WITH_LXC */