/*
 * Copyright (C) 2010-2011 Red Hat, Inc.
 * Copyright IBM Corp. 2008
 *
 * lxc_controller.c: linux container process controller
 *
 * Authors:
 *  David L. Leskovec <dlesko at linux.vnet.ibm.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <config.h>

#include <sys/epoll.h>
28 29
#include <sys/wait.h>
#include <sys/socket.h>
30 31
#include <sys/types.h>
#include <sys/un.h>
32 33
#include <sys/utsname.h>
#include <sys/personality.h>
34
#include <unistd.h>
35
#include <paths.h>
36
#include <errno.h>
37 38
#include <fcntl.h>
#include <signal.h>
39
#include <getopt.h>
40
#include <sys/mount.h>
E
Eric Blake 已提交
41
#include <locale.h>
42

D
Daniel P. Berrange 已提交
43
#if HAVE_CAPNG
44
# include <cap-ng.h>
D
Daniel P. Berrange 已提交
45 46
#endif

47
#include "virterror_internal.h"
48
#include "logging.h"
49 50 51
#include "util.h"

#include "lxc_conf.h"
52 53 54 55
#include "lxc_container.h"
#include "veth.h"
#include "memory.h"
#include "util.h"
56
#include "files.h"
57

58 59
#define VIR_FROM_THIS VIR_FROM_LXC

D
Dan Smith 已提交
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
/* One entry of the device whitelist applied to a container's cgroup.
 * The fields are handed to virCgroupAllowDevice() as (type, major, minor).
 * Tables of these entries are terminated by an entry whose type is 0. */
struct cgroup_device_policy {
    char type;   /* device type character, e.g. 'c' */
    int major;   /* device major number */
    int minor;   /* device minor number */
};

/**
 * lxcSetContainerResources
 * @def: pointer to virtual machine structure
 *
 * Creates a cgroup for the container, applies the blkio weight and the
 * memory limits from @def, restricts device access to a fixed whitelist
 * (plus all PTY devices) and finally moves the current process into the
 * cgroup.
 *
 * Returns 0 on success, or when no driver cgroup is configured
 * (virCgroupForDriver() reported -ENXIO or -ENOENT).  On failure the
 * non-zero code of the failing cgroup call is returned; the code negates
 * it before reporting, i.e. it is treated as a negative errno value.
 */
static int lxcSetContainerResources(virDomainDefPtr def)
{
    /* Both handles start out NULL so that the cleanup path never hands
     * an indeterminate pointer to virCgroupFree(). */
    virCgroupPtr driver = NULL;
    virCgroupPtr cgroup = NULL;
    int rc = -1;
    int i;
    /* Devices every container may use; terminated by type == 0 */
    struct cgroup_device_policy devices[] = {
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM},
        {'c', LXC_DEV_MAJ_TTY, LXC_DEV_MIN_TTY},
        {'c', LXC_DEV_MAJ_TTY, LXC_DEV_MIN_PTMX},
        {0,   0, 0}};

    rc = virCgroupForDriver("lxc", &driver, 1, 0);
    if (rc != 0) {
        /* Skip all if no driver cgroup is configured */
        if (rc == -ENXIO || rc == -ENOENT)
            return 0;

        virReportSystemError(-rc, "%s",
                             _("Unable to get cgroup for driver"));
        return rc;
    }

    rc = virCgroupForDomain(driver, def->name, &cgroup, 1);
    if (rc != 0) {
        virReportSystemError(-rc,
                             _("Unable to create cgroup for domain %s"),
                             def->name);
        goto cleanup;
    }

    /* Optional tunables are only applied when set to a non-zero value */
    if (def->blkio.weight) {
        rc = virCgroupSetBlkioWeight(cgroup, def->blkio.weight);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to set Blkio weight for domain %s"),
                                 def->name);
            goto cleanup;
        }
    }

    rc = virCgroupSetMemory(cgroup, def->mem.max_balloon);
    if (rc != 0) {
        virReportSystemError(-rc,
                             _("Unable to set memory limit for domain %s"),
                             def->name);
        goto cleanup;
    }

    if (def->mem.hard_limit) {
        rc = virCgroupSetMemoryHardLimit(cgroup, def->mem.hard_limit);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to set memory hard limit for domain %s"),
                                 def->name);
            goto cleanup;
        }
    }

    if (def->mem.soft_limit) {
        rc = virCgroupSetMemorySoftLimit(cgroup, def->mem.soft_limit);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to set memory soft limit for domain %s"),
                                 def->name);
            goto cleanup;
        }
    }

    if (def->mem.swap_hard_limit) {
        rc = virCgroupSetMemSwapHardLimit(cgroup, def->mem.swap_hard_limit);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to set swap hard limit for domain %s"),
                                 def->name);
            goto cleanup;
        }
    }

    /* Deny everything first, then open up the whitelist below */
    rc = virCgroupDenyAllDevices(cgroup);
    if (rc != 0) {
        virReportSystemError(-rc,
                             _("Unable to deny devices for domain %s"),
                             def->name);
        goto cleanup;
    }

    for (i = 0; devices[i].type != 0; i++) {
        struct cgroup_device_policy *dev = &devices[i];
        rc = virCgroupAllowDevice(cgroup,
                                  dev->type,
                                  dev->major,
                                  dev->minor,
                                  VIR_CGROUP_DEVICE_RWM);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to allow device %c:%d:%d for domain %s"),
                                 dev->type, dev->major, dev->minor, def->name);
            goto cleanup;
        }
    }

    rc = virCgroupAllowDeviceMajor(cgroup, 'c', LXC_DEV_MAJ_PTY,
                                   VIR_CGROUP_DEVICE_RWM);
    if (rc != 0) {
        virReportSystemError(-rc,
                             _("Unable to allow PTY devices for domain %s"),
                             def->name);
        goto cleanup;
    }

    /* A failure here is reported and then falls through to cleanup
     * with rc still holding the error. */
    rc = virCgroupAddTask(cgroup, getpid());
    if (rc != 0) {
        virReportSystemError(-rc,
                             _("Unable to add task %d to cgroup for domain %s"),
                             getpid(), def->name);
    }

cleanup:
    virCgroupFree(&driver);
    virCgroupFree(&cgroup);

    return rc;
}

204 205 206
/**
 * lxcMonitorPath:
 * @def: domain definition whose name selects the socket
 *
 * Builds the path of the monitor socket, "LXC_STATE_DIR/<name>.sock".
 *
 * Returns a newly allocated string, or NULL after reporting an
 * out-of-memory error.
 */
static char*lxcMonitorPath(virDomainDefPtr def)
{
    char *sockpath = NULL;

    if (virAsprintf(&sockpath, "%s/%s.sock",
                    LXC_STATE_DIR, def->name) < 0) {
        virReportOOMError();
        /* Do not rely on the formatter having reset the pointer */
        return NULL;
    }

    return sockpath;
}

static int lxcMonitorServer(const char *sockpath)
{
    int fd;
    struct sockaddr_un addr;

    if ((fd = socket(PF_UNIX, SOCK_STREAM, 0)) < 0) {
220
        virReportSystemError(errno,
221 222
                             _("failed to create server socket '%s'"),
                             sockpath);
223 224 225 226 227 228
        goto error;
    }

    unlink(sockpath);
    memset(&addr, 0, sizeof(addr));
    addr.sun_family = AF_UNIX;
C
Chris Lalancette 已提交
229
    if (virStrcpyStatic(addr.sun_path, sockpath) == NULL) {
230
        lxcError(VIR_ERR_INTERNAL_ERROR,
C
Chris Lalancette 已提交
231 232 233
                 _("Socket path %s too long for destination"), sockpath);
        goto error;
    }
234 235

    if (bind(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
236
        virReportSystemError(errno,
237 238
                             _("failed to bind server socket '%s'"),
                             sockpath);
239 240 241
        goto error;
    }
    if (listen(fd, 30 /* backlog */ ) < 0) {
242
        virReportSystemError(errno,
243 244
                             _("failed to listen server socket %s"),
                             sockpath);
245 246 247 248 249 250
        goto error;
    }

    return fd;

error:
251
    VIR_FORCE_CLOSE(fd);
252 253
    return -1;
}
254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274

/**
 * lxcFdForward:
 * @readFd: file descriptor to read
 * @writeFd: file descriptor to write
 *
 * Reads 1 byte of data from readFd and writes to writeFd.
 *
 * Returns 0 on success, EAGAIN if the read did not deliver a byte and
 * errno is EAGAIN, or -1 in case of any other read or write failure
 * (which is reported).
 */
static int lxcFdForward(int readFd, int writeFd)
{
    char byte;

    if (1 != (saferead(readFd, &byte, 1))) {
        /* NOTE(review): if saferead() can return 0 at end-of-file without
         * touching errno, this test looks at a stale errno value --
         * confirm against saferead()'s contract. */
        if (EAGAIN == errno)
            return EAGAIN;

        virReportSystemError(errno,
                             _("read of fd %d failed"),
                             readFd);
        return -1;
    }

    if (1 != (safewrite(writeFd, &byte, 1))) {
        virReportSystemError(errno,
                             _("write to fd %d failed"),
                             writeFd);
        return -1;
    }

    return 0;
}

D
Daniel P. Berrange 已提交
294 295 296 297 298 299 300 301 302

/**
 * lxcControllerClearCapabilities:
 *
 * Clears the capability sets selected by CAPNG_SELECT_BOTH and applies
 * the result to the current process.  When built without libcap-ng
 * only a warning is logged.
 *
 * Returns 0 on success (or when libcap-ng support is not compiled in),
 * -1 if applying the cleared capabilities failed.
 */
static int lxcControllerClearCapabilities(void)
{
#if HAVE_CAPNG
    int ret;

    capng_clear(CAPNG_SELECT_BOTH);

    ret = capng_apply(CAPNG_SELECT_BOTH);
    if (ret < 0) {
        lxcError(VIR_ERR_INTERNAL_ERROR,
                 _("failed to apply capabilities: %d"), ret);
        return -1;
    }
#else
    VIR_WARN0("libcap-ng support not compiled in, unable to clear capabilities");
#endif
    return 0;
}

313 314 315 316 317
/* Per-pty forwarding state used by lxcControllerMain(). */
typedef struct _lxcTtyForwardFd_t {
    int fd;      /* pty file descriptor */
    int active;  /* non-zero while the fd is considered to have data pending */
} lxcTtyForwardFd_t;

318 319 320 321 322 323 324 325 326 327 328
/* Tell whether a failed accept() that followed an epoll wake-up may be
   silently skipped.  @errnum is the errno left by accept(); the answer
   is true for EINVAL, ECONNABORTED, EAGAIN and EWOULDBLOCK, false for
   every other value.  */
static bool
ignorable_epoll_accept_errno(int errnum)
{
    switch (errnum) {
    case EINVAL:
    case ECONNABORTED:
    case EAGAIN:
#if EWOULDBLOCK != EAGAIN
    /* Only a distinct label where the platform gives it its own value */
    case EWOULDBLOCK:
#endif
        return true;
    default:
        return false;
    }
}

329 330 331 332 333 334 335 336 337 338 339 340
/* Check whether the container process has disappeared.
 *
 * A non-blocking waitpid() is issued first (its result is ignored), then
 * the pid is probed with signal 0.  Returns true only when that probe
 * fails with ESRCH; any other outcome yields false. */
static bool
lxcPidGone(pid_t container)
{
    bool gone = false;

    waitpid(container, NULL, WNOHANG);

    if (kill(container, 0) < 0) {
        if (errno == ESRCH)
            gone = true;
    }

    return gone;
}

341
/**
342 343 344 345 346
 * lxcControllerMain
 * @monitor: server socket fd to accept client requests
 * @client: initial client which is the libvirtd daemon
 * @appPty: open fd for application facing Pty
 * @contPty: open fd for container facing Pty
347 348 349 350 351 352 353 354
 *
 * Forwards traffic between fds.  Data read from appPty will be written to contPty
 * This process loops forever.
 * This uses epoll in edge triggered mode to avoid a hard loop on POLLHUP
 * events when the user disconnects the virsh console via ctrl-]
 *
 * Returns 0 on success or -1 in case of error
 */
355 356 357
static int lxcControllerMain(int monitor,
                             int client,
                             int appPty,
358 359
                             int contPty,
                             pid_t container)
360 361 362 363 364 365 366 367 368 369 370 371 372 373 374
{
    int rc = -1;
    int epollFd;
    struct epoll_event epollEvent;
    int numEvents;
    int numActive = 0;
    lxcTtyForwardFd_t fdArray[2];
    int timeout = -1;
    int curFdOff = 0;
    int writeFdOff = 0;

    fdArray[0].fd = appPty;
    fdArray[0].active = 0;
    fdArray[1].fd = contPty;
    fdArray[1].active = 0;
375 376 377 378

    VIR_DEBUG("monitor=%d client=%d appPty=%d contPty=%d",
              monitor, client, appPty, contPty);

379 380 381
    /* create the epoll fild descriptor */
    epollFd = epoll_create(2);
    if (0 > epollFd) {
382
        virReportSystemError(errno, "%s",
383
                             _("epoll_create(2) failed"));
384 385 386 387 388 389 390 391
        goto cleanup;
    }

    /* add the file descriptors the epoll fd */
    memset(&epollEvent, 0x00, sizeof(epollEvent));
    epollEvent.events = EPOLLIN|EPOLLET;    /* edge triggered */
    epollEvent.data.fd = appPty;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, appPty, &epollEvent)) {
392
        virReportSystemError(errno, "%s",
393
                             _("epoll_ctl(appPty) failed"));
394 395 396 397
        goto cleanup;
    }
    epollEvent.data.fd = contPty;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, contPty, &epollEvent)) {
398
        virReportSystemError(errno, "%s",
399
                             _("epoll_ctl(contPty) failed"));
400 401 402
        goto cleanup;
    }

403 404 405
    epollEvent.events = EPOLLIN;
    epollEvent.data.fd = monitor;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, monitor, &epollEvent)) {
406
        virReportSystemError(errno, "%s",
407
                             _("epoll_ctl(monitor) failed"));
408 409 410 411 412 413
        goto cleanup;
    }

    epollEvent.events = EPOLLHUP;
    epollEvent.data.fd = client;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, client, &epollEvent)) {
414
        virReportSystemError(errno, "%s",
415
                             _("epoll_ctl(client) failed"));
416 417 418
        goto cleanup;
    }

419 420 421 422
    while (1) {
        /* if active fd's, return if no events, else wait forever */
        timeout = (numActive > 0) ? 0 : -1;
        numEvents = epoll_wait(epollFd, &epollEvent, 1, timeout);
423 424 425
        if (numEvents > 0) {
            if (epollEvent.data.fd == monitor) {
                int fd = accept(monitor, NULL, 0);
426 427 428 429 430 431 432 433 434 435 436 437
                if (fd < 0) {
                    /* First reflex may be simply to declare accept failure
                       to be a fatal error.  However, accept may fail when
                       a client quits between the above epoll_wait and here.
                       That case is not fatal, but rather to be expected,
                       if not common, so ignore it.  */
                    if (ignorable_epoll_accept_errno(errno))
                        continue;
                    virReportSystemError(errno, "%s",
                                         _("accept(monitor,...) failed"));
                    goto cleanup;
                }
438
                if (client != -1) { /* Already connected, so kick new one out */
439
                    VIR_FORCE_CLOSE(fd);
440
                    continue;
441
                }
442 443 444 445
                client = fd;
                epollEvent.events = EPOLLHUP;
                epollEvent.data.fd = client;
                if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, client, &epollEvent)) {
446
                    virReportSystemError(errno, "%s",
447
                                         _("epoll_ctl(client) failed"));
448 449 450 451
                    goto cleanup;
                }
            } else if (client != -1 && epollEvent.data.fd == client) {
                if (0 > epoll_ctl(epollFd, EPOLL_CTL_DEL, client, &epollEvent)) {
452
                    virReportSystemError(errno, "%s",
453
                                         _("epoll_ctl(client) failed"));
454 455
                    goto cleanup;
                }
456
                VIR_FORCE_CLOSE(client);
457
            } else {
458 459 460 461 462 463 464
                if (epollEvent.events & EPOLLIN) {
                    curFdOff = epollEvent.data.fd == appPty ? 0 : 1;
                    if (!fdArray[curFdOff].active) {
                        fdArray[curFdOff].active = 1;
                        ++numActive;
                    }
                } else if (epollEvent.events & EPOLLHUP) {
465 466 467 468 469 470 471
                    if (lxcPidGone(container))
                        goto cleanup;
                    curFdOff = epollEvent.data.fd == appPty ? 0 : 1;
                    if (fdArray[curFdOff].active) {
                        fdArray[curFdOff].active = 0;
                        --numActive;
                    }
472 473
                    continue;
                } else {
474
                    lxcError(VIR_ERR_INTERNAL_ERROR,
475 476 477
                             _("error event %d"), epollEvent.events);
                    goto cleanup;
                }
478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494
            }
        } else if (0 == numEvents) {
            if (2 == numActive) {
                /* both fds active, toggle between the two */
                curFdOff ^= 1;
            } else {
                /* only one active, if current is active, use it, else it */
                /* must be the other one (ie. curFd just went inactive) */
                curFdOff = fdArray[curFdOff].active ? curFdOff : curFdOff ^ 1;
            }

        } else  {
            if (EINTR == errno) {
                continue;
            }

            /* error */
495
            virReportSystemError(errno, "%s",
496
                                 _("epoll_wait() failed"));
497 498 499 500 501 502 503 504 505 506 507 508 509
            goto cleanup;

        }

        if (0 < numActive) {
            writeFdOff = curFdOff ^ 1;
            rc = lxcFdForward(fdArray[curFdOff].fd, fdArray[writeFdOff].fd);

            if (EAGAIN == rc) {
                /* this fd no longer has data, set it as inactive */
                --numActive;
                fdArray[curFdOff].active = 0;
            } else if (-1 == rc) {
510 511 512
                if (lxcPidGone(container))
                    goto cleanup;
                continue;
513 514 515 516 517 518 519 520 521
            }

        }

    }

    rc = 0;

cleanup:
522 523 524
    VIR_FORCE_CLOSE(appPty);
    VIR_FORCE_CLOSE(contPty);
    VIR_FORCE_CLOSE(epollFd);
525 526 527
    return rc;
}

528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545


/**
 * lxcControllerMoveInterfaces
 * @nveths: number of interfaces
 * @veths: interface names
 * @container: pid of container
 *
 * Moves network interfaces into a container's namespace.  Processing
 * stops at the first interface that cannot be moved.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcControllerMoveInterfaces(unsigned int nveths,
                                       char **veths,
                                       pid_t container)
{
    unsigned int i;

    for (i = 0 ; i < nveths ; i++) {
        if (moveInterfaceToNetNs(veths[i], container) < 0)
            return -1;
    }

    return 0;
}


/**
 * lxcControllerCleanupInterfaces:
 * @nveths: number of interfaces
 * @veths: interface names
 *
 * Cleans up the container interfaces by deleting the veth device pairs.
 * The result of each individual deletion is ignored, so this is a
 * best-effort operation.
 *
 * Returns 0 always
 */
static int lxcControllerCleanupInterfaces(unsigned int nveths,
                                          char **veths)
{
    unsigned int i;

    for (i = 0 ; i < nveths ; i++) {
        vethDelete(veths[i]);
    }

    return 0;
}

572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590
/**
 * lxcSetPersonality:
 * @def: domain definition providing the requested OS architecture
 *
 * Looks up the alternate 32-bit architecture for the machine reported by
 * uname(); if the domain asks for exactly that architecture, switches
 * the process personality to PER_LINUX32.
 *
 * Returns 0 when nothing had to be done or the switch succeeded,
 * -1 (after reporting) if personality() failed.
 */
static int lxcSetPersonality(virDomainDefPtr def)
{
    struct utsname hostInfo;
    const char *alt32;

    uname(&hostInfo);
    alt32 = lxcContainerGetAlt32bitArch(hostInfo.machine);

    /* No alternate arch for this host, or the guest does not want it */
    if (!alt32 || !STREQ(def->os.arch, alt32))
        return 0;

    if (personality(PER_LINUX32) < 0) {
        virReportSystemError(errno, _("Unable to request personality for %s on %s"),
                             alt32, hostInfo.machine);
        return -1;
    }

    return 0;
}

591
/* Fallback definitions for mount flags that the system headers may not
 * provide; both are used when switching the root mount to slave mode. */
#ifndef MS_REC
# define MS_REC          16384
#endif

#ifndef MS_SLAVE
# define MS_SLAVE              (1<<19)
#endif
598 599

static int
600
lxcControllerRun(virDomainDefPtr def,
601 602 603 604 605 606 607 608
                 unsigned int nveths,
                 char **veths,
                 int monitor,
                 int client,
                 int appPty)
{
    int rc = -1;
    int control[2] = { -1, -1};
609
    int containerPty = -1;
610
    char *containerPtyPath = NULL;
611
    pid_t container = -1;
612 613 614
    virDomainFSDefPtr root;
    char *devpts = NULL;
    char *devptmx = NULL;
615 616

    if (socketpair(PF_UNIX, SOCK_STREAM, 0, control) < 0) {
617
        virReportSystemError(errno, "%s",
618
                             _("sockpair failed"));
619 620 621
        goto cleanup;
    }

622 623
    root = virDomainGetRootFilesystem(def);

624 625 626
    if (lxcSetContainerResources(def) < 0)
        goto cleanup;

627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649
    /*
     * If doing a chroot style setup, we need to prepare
     * a private /dev/pts for the child now, which they
     * will later move into position.
     *
     * This is complex because 'virsh console' needs to
     * use /dev/pts from the host OS, and the guest OS
     * needs to use /dev/pts from the guest.
     *
     * This means that we (libvirt_lxc) need to see and
     * use both /dev/pts instances. We're running in the
     * host OS context though and don't want to expose
     * the guest OS /dev/pts there.
     *
     * Thus we call unshare(CLONE_NS) so that we can see
     * the guest's new /dev/pts, without it becoming
     * visible to the host OS. We also put the root FS
     * into slave mode, just in case it was currently
     * marked as shared
     */
    if (root) {
        VIR_DEBUG0("Setting up private /dev/pts");
        if (unshare(CLONE_NEWNS) < 0) {
650
            virReportSystemError(errno, "%s",
651
                                 _("Cannot unshare mount namespace"));
652 653 654 655
            goto cleanup;
        }

        if (mount("", "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
656
            virReportSystemError(errno, "%s",
657
                                 _("Failed to switch root mount into slave mode"));
658 659 660 661 662
            goto cleanup;
        }

        if (virAsprintf(&devpts, "%s/dev/pts", root->src) < 0 ||
            virAsprintf(&devptmx, "%s/dev/pts/ptmx", root->src) < 0) {
663
            virReportOOMError();
664 665 666
            goto cleanup;
        }

L
Laine Stump 已提交
667
        if (virFileMakePath(devpts) != 0) {
668
            virReportSystemError(errno,
669
                                 _("Failed to make path %s"),
670 671 672 673 674
                                 devpts);
            goto cleanup;
        }

        VIR_DEBUG("Mouting 'devpts' on %s", devpts);
675 676
        if (mount("devpts", devpts, "devpts", 0,
                  "newinstance,ptmxmode=0666,mode=0620,gid=5") < 0) {
677
            virReportSystemError(errno,
678
                                 _("Failed to mount devpts on %s"),
679 680 681 682 683
                                 devpts);
            goto cleanup;
        }

        if (access(devptmx, R_OK) < 0) {
684
            VIR_WARN0("Kernel does not support private devpts, using shared devpts");
685 686
            VIR_FREE(devptmx);
        }
687 688
    }

689 690 691 692 693 694
    if (devptmx) {
        VIR_DEBUG("Opening tty on private %s", devptmx);
        if (virFileOpenTtyAt(devptmx,
                             &containerPty,
                             &containerPtyPath,
                             0) < 0) {
695
            virReportSystemError(errno, "%s",
696
                                 _("Failed to allocate tty"));
697 698 699 700 701 702 703
            goto cleanup;
        }
    } else {
        VIR_DEBUG0("Opening tty on shared /dev/ptmx");
        if (virFileOpenTty(&containerPty,
                           &containerPtyPath,
                           0) < 0) {
704
            virReportSystemError(errno, "%s",
705
                                 _("Failed to allocate tty"));
706 707 708 709
            goto cleanup;
        }
    }

710 711
    if (lxcSetPersonality(def) < 0)
        goto cleanup;
712

713 714 715 716 717 718
    if ((container = lxcContainerStart(def,
                                       nveths,
                                       veths,
                                       control[1],
                                       containerPtyPath)) < 0)
        goto cleanup;
719
    VIR_FORCE_CLOSE(control[1]);
720 721 722 723 724 725 726

    if (lxcControllerMoveInterfaces(nveths, veths, container) < 0)
        goto cleanup;

    if (lxcContainerSendContinue(control[0]) < 0)
        goto cleanup;

D
Daniel P. Berrange 已提交
727 728 729 730 731
    /* Now the container is running, there's no need for us to keep
       any elevated capabilities */
    if (lxcControllerClearCapabilities() < 0)
        goto cleanup;

732
    rc = lxcControllerMain(monitor, client, appPty, containerPty, container);
733 734

cleanup:
735 736
    VIR_FREE(devptmx);
    VIR_FREE(devpts);
737 738
    VIR_FORCE_CLOSE(control[0]);
    VIR_FORCE_CLOSE(control[1]);
739
    VIR_FREE(containerPtyPath);
740
    VIR_FORCE_CLOSE(containerPty);
741

742
    if (container > 1) {
743
        int status;
744
        kill(container, SIGTERM);
745 746 747
        if (!(waitpid(container, &status, WNOHANG) == 0 &&
            WIFEXITED(status)))
            kill(container, SIGKILL);
748 749
        waitpid(container, NULL, 0);
    }
750 751 752 753
    return rc;
}


754
int main(int argc, char *argv[])
755 756
{
    pid_t pid;
757
    int rc = 1;
758
    int client;
759 760 761 762 763 764 765 766 767 768
    char *name = NULL;
    int nveths = 0;
    char **veths = NULL;
    int monitor = -1;
    int appPty = -1;
    int bg = 0;
    virCapsPtr caps = NULL;
    virDomainDefPtr def = NULL;
    char *configFile = NULL;
    char *sockpath = NULL;
769
    const struct option options[] = {
770 771 772 773 774 775 776
        { "background", 0, NULL, 'b' },
        { "name",   1, NULL, 'n' },
        { "veth",   1, NULL, 'v' },
        { "console", 1, NULL, 'c' },
        { "help", 0, NULL, 'h' },
        { 0, 0, 0, 0 },
    };
777

E
Eric Blake 已提交
778 779 780 781 782 783 784
    if (setlocale(LC_ALL, "") == NULL ||
        bindtextdomain(PACKAGE, LOCALEDIR) == NULL ||
        textdomain(PACKAGE) == NULL) {
        fprintf(stderr, _("%s: initialization failed\n"), argv[0]);
        exit(EXIT_FAILURE);
    }

785 786
    while (1) {
        int c;
787

788 789 790 791 792 793 794 795 796 797 798 799 800
        c = getopt_long(argc, argv, "dn:v:m:c:h",
                       options, NULL);

        if (c == -1)
            break;

        switch (c) {
        case 'b':
            bg = 1;
            break;

        case 'n':
            if ((name = strdup(optarg)) == NULL) {
801
                virReportOOMError();
802
                goto cleanup;
803
            }
804 805 806 807
            break;

        case 'v':
            if (VIR_REALLOC_N(veths, nveths+1) < 0) {
808
                virReportOOMError();
809
                goto cleanup;
810
            }
811
            if ((veths[nveths++] = strdup(optarg)) == NULL) {
812
                virReportOOMError();
813
                goto cleanup;
814
            }
815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837
            break;

        case 'c':
            if (virStrToLong_i(optarg, NULL, 10, &appPty) < 0) {
                fprintf(stderr, "malformed --console argument '%s'", optarg);
                goto cleanup;
            }
            break;

        case 'h':
        case '?':
            fprintf(stderr, "\n");
            fprintf(stderr, "syntax: %s [OPTIONS]\n", argv[0]);
            fprintf(stderr, "\n");
            fprintf(stderr, "Options\n");
            fprintf(stderr, "\n");
            fprintf(stderr, "  -b, --background\n");
            fprintf(stderr, "  -n NAME, --name NAME\n");
            fprintf(stderr, "  -c FD, --console FD\n");
            fprintf(stderr, "  -v VETH, --veth VETH\n");
            fprintf(stderr, "  -h, --help\n");
            fprintf(stderr, "\n");
            goto cleanup;
838 839 840 841
        }
    }


842 843 844 845 846 847 848 849 850 851
    if (name == NULL) {
        fprintf(stderr, "%s: missing --name argument for configuration\n", argv[0]);
        goto cleanup;
    }

    if (appPty < 0) {
        fprintf(stderr, "%s: missing --console argument for container PTY\n", argv[0]);
        goto cleanup;
    }

852
    if (getuid() != 0) {
853 854 855
        fprintf(stderr, "%s: must be run as the 'root' user\n", argv[0]);
        goto cleanup;
    }
856

857 858
    if ((caps = lxcCapsInit()) == NULL)
        goto cleanup;
859

860
    if ((configFile = virDomainConfigFile(LXC_STATE_DIR,
861 862
                                          name)) == NULL)
        goto cleanup;
863

864
    if ((def = virDomainDefParseFile(caps, configFile,
865
                                     VIR_DOMAIN_XML_INACTIVE)) == NULL)
866
        goto cleanup;
867

868
    if (def->nnets != nveths) {
869
        fprintf(stderr, "%s: expecting %d veths, but got %d\n",
870
                argv[0], def->nnets, nveths);
871
        goto cleanup;
872 873
    }

874 875
    if ((sockpath = lxcMonitorPath(def)) == NULL)
        goto cleanup;
876

877 878
    if ((monitor = lxcMonitorServer(sockpath)) < 0)
        goto cleanup;
879

880 881 882
    if (bg) {
        if ((pid = fork()) < 0)
            goto cleanup;
883

884 885
        if (pid > 0) {
            if ((rc = virFileWritePid(LXC_STATE_DIR, name, pid)) != 0) {
886
                virReportSystemError(rc,
887 888
                                     _("Unable to write pid file '%s/%s.pid'"),
                                     LXC_STATE_DIR, name);
889 890
                _exit(1);
            }
891

892 893 894 895
            /* First child now exits, allowing original caller
             * (ie libvirtd's LXC driver to complete their
             * waitpid & continue */
            _exit(0);
896 897
        }

898 899
        /* Don't hold onto any cwd we inherit from libvirtd either */
        if (chdir("/") < 0) {
900
            virReportSystemError(errno, "%s",
901
                                 _("Unable to change to root dir"));
902 903 904 905
            goto cleanup;
        }

        if (setsid() < 0) {
906
            virReportSystemError(errno, "%s",
907
                                 _("Unable to become session leader"));
908 909 910
            goto cleanup;
        }
    }
911

A
Amy Griffis 已提交
912 913 914
    /* Initialize logging */
    virLogSetFromEnv();

915
    /* Accept initial client which is the libvirtd daemon */
916
    if ((client = accept(monitor, NULL, 0)) < 0) {
917
        virReportSystemError(errno, "%s",
918
                             _("Failed to accept a connection from driver"));
919
        goto cleanup;
920 921
    }

922
    rc = lxcControllerRun(def, nveths, veths, monitor, client, appPty);
923 924


925
cleanup:
926 927
    if (def)
        virFileDeletePid(LXC_STATE_DIR, def->name);
928
    lxcControllerCleanupInterfaces(nveths, veths);
J
Jim Meyering 已提交
929 930
    if (sockpath)
        unlink(sockpath);
931 932 933 934
    VIR_FREE(sockpath);

    return rc;
}