/*
 * Copyright (C) 2010-2012 Red Hat, Inc.
 * Copyright IBM Corp. 2008
 *
 * lxc_controller.c: linux container process controller
 *
 * Authors:
 *  David L. Leskovec <dlesko at linux.vnet.ibm.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 */

#include <config.h>

#include <sys/epoll.h>
28 29
#include <sys/wait.h>
#include <sys/socket.h>
30 31
#include <sys/types.h>
#include <sys/un.h>
32 33
#include <sys/utsname.h>
#include <sys/personality.h>
34
#include <unistd.h>
35
#include <paths.h>
36
#include <errno.h>
37 38
#include <fcntl.h>
#include <signal.h>
39
#include <getopt.h>
40
#include <sys/mount.h>
E
Eric Blake 已提交
41
#include <locale.h>
42 43
#include <grp.h>
#include <sys/stat.h>
44
#include <time.h>
45

D
Daniel P. Berrange 已提交
46
#if HAVE_CAPNG
47
# include <cap-ng.h>
D
Daniel P. Berrange 已提交
48 49
#endif

50 51 52 53 54
#if HAVE_NUMACTL
# define NUMA_VERSION1_COMPATIBILITY 1
# include <numa.h>
#endif

55
#include "virterror_internal.h"
56
#include "logging.h"
57 58 59
#include "util.h"

#include "lxc_conf.h"
60
#include "lxc_container.h"
61
#include "lxc_cgroup.h"
62
#include "lxc_protocol.h"
G
Gao feng 已提交
63
#include "lxc_fuse.h"
64 65
#include "virnetdev.h"
#include "virnetdevveth.h"
66 67
#include "memory.h"
#include "util.h"
E
Eric Blake 已提交
68
#include "virfile.h"
69
#include "virpidfile.h"
70
#include "command.h"
71 72
#include "processinfo.h"
#include "nodeinfo.h"
73
#include "virrandom.h"
74
#include "virprocess.h"
75
#include "rpc/virnetserver.h"
76

77 78
#define VIR_FROM_THIS VIR_FROM_LXC

79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
typedef struct _virLXCControllerConsole virLXCControllerConsole;
typedef virLXCControllerConsole *virLXCControllerConsolePtr;
struct _virLXCControllerConsole {
    int hostWatch;
    int hostFd;  /* PTY FD in the host OS */
    bool hostClosed;
    int hostEpoll;
    bool hostBlocking;

    int contWatch;
    int contFd;  /* PTY FD in the container */
    bool contClosed;
    int contEpoll;
    bool contBlocking;

    int epollWatch;
    int epollFd; /* epoll FD for dealing with EOF */

    size_t fromHostLen;
    char fromHostBuf[1024];
    size_t fromContLen;
    char fromContBuf[1024];
101 102

    virNetServerPtr server;
103 104
};

105 106 107 108 109
typedef struct _virLXCController virLXCController;
typedef virLXCController *virLXCControllerPtr;
struct _virLXCController {
    char *name;
    virDomainDefPtr def;
110

111 112
    int handshakeFd;

113 114
    pid_t initpid;

115 116
    size_t nveths;
    char **veths;
117 118 119

    size_t nconsoles;
    virLXCControllerConsolePtr consoles;
120
    char *devptmx;
121 122 123

    size_t nloopDevs;
    int *loopDevFds;
124 125

    virSecurityManagerPtr securityManager;
126

127 128
    /* Server socket */
    virNetServerPtr server;
129
    bool firstClient;
130 131 132 133
    virNetServerClientPtr client;
    virNetServerProgramPtr prog;
    bool inShutdown;
    int timerShutdown;
G
Gao feng 已提交
134 135

    virLXCFusePtr fuse;
136 137
};

138 139
#include "lxc_controller_dispatch.h"

140
/* Forward declarations for functions referenced before their definition */
static void
virLXCControllerFree(virLXCControllerPtr ctrl);
static int
virLXCControllerEventSendInit(virLXCControllerPtr ctrl, pid_t initpid);
143

144 145 146 147 148 149 150 151 152
/**
 * virLXCControllerQuitTimer:
 * @timer: id of the timer that fired (unused)
 * @opaque: the controller state
 *
 * Timer callback which asks the controller's server to quit.
 */
static void
virLXCControllerQuitTimer(int timer ATTRIBUTE_UNUSED, void *opaque)
{
    virLXCControllerPtr controller = opaque;

    VIR_DEBUG("Triggering event loop quit");

    virNetServerQuit(controller->server);
}


153 154 155 156 157 158 159 160 161
static virLXCControllerPtr virLXCControllerNew(const char *name)
{
    virLXCControllerPtr ctrl = NULL;
    virCapsPtr caps = NULL;
    char *configFile = NULL;

    if (VIR_ALLOC(ctrl) < 0)
        goto no_memory;

162
    ctrl->timerShutdown = -1;
163
    ctrl->firstClient = true;
164

165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
    if (!(ctrl->name = strdup(name)))
        goto no_memory;

    if ((caps = lxcCapsInit(NULL)) == NULL)
        goto error;

    if ((configFile = virDomainConfigFile(LXC_STATE_DIR,
                                          ctrl->name)) == NULL)
        goto error;

    if ((ctrl->def = virDomainDefParseFile(caps,
                                           configFile,
                                           1 << VIR_DOMAIN_VIRT_LXC,
                                           0)) == NULL)
        goto error;

181 182 183 184 185
    if ((ctrl->timerShutdown = virEventAddTimeout(-1,
                                                  virLXCControllerQuitTimer, ctrl,
                                                  NULL)) < 0)
        goto error;

186 187 188 189 190 191 192 193 194 195 196 197 198
cleanup:
    VIR_FREE(configFile);
    virCapabilitiesFree(caps);
    return ctrl;

no_memory:
    virReportOOMError();
error:
    virLXCControllerFree(ctrl);
    ctrl = NULL;
    goto cleanup;
}

199

200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
/**
 * virLXCControllerCloseLoopDevices:
 * @ctrl: the controller state
 * @force: if true, close every FD and ignore close failures
 *
 * Closes the loop device FDs held by @ctrl. Without @force the
 * first close failure is reported and stops the iteration.
 *
 * Returns 0 on success or -1 if a non-forced close failed
 */
static int virLXCControllerCloseLoopDevices(virLXCControllerPtr ctrl,
                                            bool force)
{
    size_t idx = 0;

    while (idx < ctrl->nloopDevs) {
        if (force) {
            VIR_FORCE_CLOSE(ctrl->loopDevFds[idx]);
        } else if (VIR_CLOSE(ctrl->loopDevFds[idx]) < 0) {
            virReportSystemError(errno, "%s",
                                 _("Unable to close loop device"));
            return -1;
        }
        idx++;
    }

    return 0;
}


221 222 223 224 225
/**
 * virLXCControllerStopInit:
 * @ctrl: the controller state
 *
 * If an init process is recorded, force closes the loop devices,
 * aborts that process and resets the recorded pid to 0.
 * Does nothing when no init pid is recorded.
 */
static void virLXCControllerStopInit(virLXCControllerPtr ctrl)
{
    if (ctrl->initpid != 0) {
        virLXCControllerCloseLoopDevices(ctrl, true);
        virProcessAbort(ctrl->initpid);
        ctrl->initpid = 0;
    }
}


232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247
/**
 * virLXCControllerConsoleClose:
 * @console: the console to tear down
 *
 * Removes any event loop watches registered for the console and
 * force closes its host, container and epoll file descriptors.
 */
static void virLXCControllerConsoleClose(virLXCControllerConsolePtr console)
{
    /* Host side PTY */
    if (console->hostWatch != -1) {
        virEventRemoveHandle(console->hostWatch);
    }
    VIR_FORCE_CLOSE(console->hostFd);

    /* Container side PTY */
    if (console->contWatch != -1) {
        virEventRemoveHandle(console->contWatch);
    }
    VIR_FORCE_CLOSE(console->contFd);

    /* epoll FD */
    if (console->epollWatch != -1) {
        virEventRemoveHandle(console->epollWatch);
    }
    VIR_FORCE_CLOSE(console->epollFd);
}


G
Gao feng 已提交
248 249 250 251 252 253 254
/**
 * virLXCControllerFreeFuse:
 * @ctrl: the controller state
 *
 * Releases the FUSE state of @ctrl via lxcFreeFuse().
 */
static void
virLXCControllerFreeFuse(virLXCControllerPtr ctrl)
{
    /* A void function must not return an expression */
    lxcFreeFuse(&ctrl->fuse);
}


255 256
/**
 * virLXCControllerFree:
 * @ctrl: the controller state, may be NULL
 *
 * Stops the init process if one is recorded and releases every
 * resource owned by @ctrl, then frees @ctrl itself.
 */
static void virLXCControllerFree(virLXCControllerPtr ctrl)
{
    size_t i;

    if (!ctrl)
        return;

    virLXCControllerStopInit(ctrl);

    /* virLXCControllerStopInit() only closes the loop devices when an
     * init pid is recorded, so close them unconditionally here (already
     * closed FDs are skipped) and release the FD array itself. */
    virLXCControllerCloseLoopDevices(ctrl, true);
    VIR_FREE(ctrl->loopDevFds);

    virSecurityManagerFree(ctrl->securityManager);

    for (i = 0 ; i < ctrl->nveths ; i++)
        VIR_FREE(ctrl->veths[i]);
    VIR_FREE(ctrl->veths);

    for (i = 0 ; i < ctrl->nconsoles ; i++)
        virLXCControllerConsoleClose(&(ctrl->consoles[i]));
    VIR_FREE(ctrl->consoles);

    VIR_FORCE_CLOSE(ctrl->handshakeFd);

    VIR_FREE(ctrl->devptmx);

    virDomainDefFree(ctrl->def);
    VIR_FREE(ctrl->name);

    if (ctrl->timerShutdown != -1)
        virEventRemoveTimeout(ctrl->timerShutdown);

    /* NOTE(review): ctrl->prog is not released here - confirm whether
     * it needs its own unref. */
    virObjectUnref(ctrl->server);
    virLXCControllerFreeFuse(ctrl);

    VIR_FREE(ctrl);
}

290

291 292 293 294 295 296 297
/**
 * virLXCControllerAddConsole:
 * @ctrl: the controller state
 * @hostFd: host side PTY FD for the new console
 *
 * Appends a console to @ctrl. The new entry records @hostFd and the
 * controller's server; its container FD, epoll FD and all watches
 * are set to -1.
 *
 * Returns 0 on success or -1 if the console array cannot be grown
 */
static int virLXCControllerAddConsole(virLXCControllerPtr ctrl,
                                      int hostFd)
{
    virLXCControllerConsolePtr added;

    if (VIR_EXPAND_N(ctrl->consoles, ctrl->nconsoles, 1) < 0) {
        virReportOOMError();
        return -1;
    }

    added = &ctrl->consoles[ctrl->nconsoles - 1];

    added->server = ctrl->server;

    added->hostFd = hostFd;
    added->hostWatch = -1;

    added->contFd = -1;
    added->contWatch = -1;

    added->epollFd = -1;
    added->epollWatch = -1;

    return 0;
}


/**
 * virLXCControllerConsoleSetNonblocking:
 * @console: the console whose FDs are to be changed
 *
 * Switches the host FD and then the container FD of @console to
 * non-blocking mode; the container FD is not touched if the host
 * FD could not be changed.
 *
 * Returns 0 on success or -1 in case of error
 */
static int virLXCControllerConsoleSetNonblocking(virLXCControllerConsolePtr console)
{
    if (virSetBlocking(console->hostFd, false) >= 0 &&
        virSetBlocking(console->contFd, false) >= 0)
        return 0;

    virReportSystemError(errno, "%s",
                         _("Unable to set console file descriptor non-blocking"));
    return -1;
}


324 325 326 327 328 329 330 331 332 333 334 335
/**
 * virLXCControllerDaemonHandshake:
 * @ctrl: the controller state
 *
 * Sends the "continue" message over the handshake FD and, on
 * success, closes that FD.
 *
 * Returns 0 on success or -1 if the message could not be sent
 */
static int virLXCControllerDaemonHandshake(virLXCControllerPtr ctrl)
{
    if (lxcContainerSendContinue(ctrl->handshakeFd) >= 0) {
        VIR_FORCE_CLOSE(ctrl->handshakeFd);
        return 0;
    }

    virReportSystemError(errno, "%s",
                         _("error sending continue signal to daemon"));
    return -1;
}


336 337 338
/**
 * virLXCControllerValidateNICs:
 * @ctrl: the controller state
 *
 * Checks that the number of veths held by the controller matches
 * the number of network devices in the domain definition.
 *
 * Returns 0 if the counts match, -1 (with an error reported) otherwise
 */
static int virLXCControllerValidateNICs(virLXCControllerPtr ctrl)
{
    if (ctrl->def->nnets == ctrl->nveths)
        return 0;

    virReportError(VIR_ERR_INTERNAL_ERROR,
                   _("expecting %zu veths, but got %zu"),
                   ctrl->def->nnets, ctrl->nveths);
    return -1;
}


349 350 351
/**
 * virLXCControllerValidateConsoles:
 * @ctrl: the controller state
 *
 * Checks that the number of consoles held by the controller matches
 * the number of consoles in the domain definition.
 *
 * Returns 0 if the counts match, -1 (with an error reported) otherwise
 */
static int virLXCControllerValidateConsoles(virLXCControllerPtr ctrl)
{
    if (ctrl->def->nconsoles == ctrl->nconsoles)
        return 0;

    virReportError(VIR_ERR_INTERNAL_ERROR,
                   _("expecting %zu consoles, but got %zu tty file handlers"),
                   ctrl->def->nconsoles, ctrl->nconsoles);
    return -1;
}


362
/**
 * virLXCControllerSetupLoopDevice:
 * @fs: filesystem definition to associate with a loop device
 *
 * Associates the source of @fs with a loop device and rewrites @fs
 * to be a block device filesystem whose source is that loop device.
 *
 * Returns the loop device FD on success or -1 in case of error
 */
static int virLXCControllerSetupLoopDevice(virDomainFSDefPtr fs)
{
    char *devname = NULL;
    int fd = virFileLoopDeviceAssociate(fs->src, &devname);

    if (fd < 0)
        return -1;

    /*
     * From here on the filesystem is treated as a block device, so
     * that the rest of container setup 'just works'. Ownership of
     * the device name passes to fs->src.
     */
    fs->type = VIR_DOMAIN_FS_TYPE_BLOCK;
    VIR_FREE(fs->src);
    fs->src = devname;

    return fd;
}


383
/**
 * virLXCControllerSetupLoopDevices:
 * @ctrl: the controller state
 *
 * Sets up a loop device for every file-backed filesystem in the
 * domain definition and records each resulting FD in @ctrl.
 *
 * Returns 0 on success or -1 in case of error
 */
static int virLXCControllerSetupLoopDevices(virLXCControllerPtr ctrl)
{
    size_t n;

    for (n = 0 ; n < ctrl->def->nfss ; n++) {
        virDomainFSDefPtr fs = ctrl->def->fss[n];
        int lofd;

        if (fs->type != VIR_DOMAIN_FS_TYPE_FILE)
            continue;

        if ((lofd = virLXCControllerSetupLoopDevice(fs)) < 0)
            return -1;

        VIR_DEBUG("Saving loop fd %d", lofd);
        if (VIR_EXPAND_N(ctrl->loopDevFds, ctrl->nloopDevs, 1) < 0) {
            /* The FD could not be recorded, so do not leak it */
            VIR_FORCE_CLOSE(lofd);
            virReportOOMError();
            return -1;
        }
        ctrl->loopDevFds[ctrl->nloopDevs - 1] = lofd;
    }

    VIR_DEBUG("Setup all loop devices");
    return 0;
}

414
#if HAVE_NUMACTL
415
static int virLXCControllerSetupNUMAPolicy(virLXCControllerPtr ctrl)
416 417 418 419 420 421 422 423 424
{
    nodemask_t mask;
    int mode = -1;
    int node = -1;
    int ret = -1;
    int i = 0;
    int maxnode = 0;
    bool warned = false;

425
    if (!ctrl->def->numatune.memory.nodemask)
426 427 428 429 430
        return 0;

    VIR_DEBUG("Setting NUMA memory policy");

    if (numa_available() < 0) {
431 432
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       "%s", _("Host kernel is not aware of NUMA."));
433 434 435 436 437 438 439
        return -1;
    }

    maxnode = numa_max_node() + 1;

    /* Convert nodemask to NUMA bitmask. */
    nodemask_zero(&mask);
440 441 442 443 444 445 446 447 448 449 450
    i = -1;
    while ((i = virBitmapNextSetBit(ctrl->def->numatune.memory.nodemask, i)) >= 0) {
        if (i > NUMA_NUM_NODES) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("Host cannot support NUMA node %d"), i);
            return -1;
        }
        if (i > maxnode && !warned) {
            VIR_WARN("nodeset is out of range, there is only %d NUMA "
                     "nodes on host", maxnode);
            warned = true;
451
        }
452
        nodemask_set(&mask, i);
453 454
    }

455
    mode = ctrl->def->numatune.memory.mode;
456 457 458 459 460 461 462 463 464 465 466 467 468 469 470

    if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
        numa_set_bind_policy(1);
        numa_set_membind(&mask);
        numa_set_bind_policy(0);
    } else if (mode == VIR_DOMAIN_NUMATUNE_MEM_PREFERRED) {
        int nnodes = 0;
        for (i = 0; i < NUMA_NUM_NODES; i++) {
            if (nodemask_isset(&mask, i)) {
                node = i;
                nnodes++;
            }
        }

        if (nnodes != 1) {
471 472 473
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           "%s", _("NUMA memory tuning in 'preferred' mode "
                                   "only supports single node"));
474 475 476 477 478 479 480 481
            goto cleanup;
        }

        numa_set_bind_policy(0);
        numa_set_preferred(node);
    } else if (mode == VIR_DOMAIN_NUMATUNE_MEM_INTERLEAVE) {
        numa_set_interleave_mask(&mask);
    } else {
482 483 484
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Unable to set NUMA policy %s"),
                       virDomainNumatuneMemModeTypeToString(mode));
485 486 487 488 489 490 491 492 493
        goto cleanup;
    }

    ret = 0;

cleanup:
    return ret;
}
#else
494
static int virLXCControllerSetupNUMAPolicy(virLXCControllerPtr ctrl)
495
{
496
    if (ctrl->def->numatune.memory.nodemask) {
497 498
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("NUMA policy is not available on this platform"));
499 500 501 502 503 504 505
        return -1;
    }

    return 0;
}
#endif

506 507 508 509

/*
 * To be run while still single threaded
 */
510
static int virLXCControllerSetupCpuAffinity(virLXCControllerPtr ctrl)
511
{
H
Hu Tao 已提交
512 513
    int hostcpus, maxcpu = CPU_SETSIZE;
    virBitmapPtr cpumap, cpumapToSet;
514 515 516 517 518

    VIR_DEBUG("Setting CPU affinity");

    /* setaffinity fails if you set bits for CPUs which
     * aren't present, so we have to limit ourselves */
519 520 521
    if ((hostcpus = nodeGetCPUCount()) < 0)
        return -1;

522 523 524
    if (maxcpu > hostcpus)
        maxcpu = hostcpus;

525 526
    cpumap = virBitmapNew(maxcpu);
    if (!cpumap)
527 528
        return -1;

H
Hu Tao 已提交
529 530
    cpumapToSet = cpumap;

531
    if (ctrl->def->cpumask) {
H
Hu Tao 已提交
532
        cpumapToSet = ctrl->def->cpumask;
533 534 535 536 537
    } else {
        /* You may think this is redundant, but we can't assume libvirtd
         * itself is running on all pCPUs, so we need to explicitly set
         * the spawned LXC instance to all pCPUs if no map is given in
         * its config file */
538
        virBitmapSetAll(cpumap);
539 540
    }

541
    /* We are presuming we are running between fork/exec of LXC
542 543 544
     * so use '0' to indicate our own process ID. No threads are
     * running at this point
     */
H
Hu Tao 已提交
545
    if (virProcessInfoSetAffinity(0 /* Self */, cpumapToSet) < 0) {
546
        virBitmapFree(cpumap);
547 548
        return -1;
    }
549
    virBitmapFree(cpumap);
550 551 552 553 554

    return 0;
}


555
/**
556 557
 * virLXCControllerSetupResourceLimits
 * @ctrl: the controller state
558 559 560 561 562 563
 *
 * Creates a cgroup for the container, moves the task inside,
 * and sets resource limits
 *
 * Returns 0 on success or -1 in case of error
 */
564
static int virLXCControllerSetupResourceLimits(virLXCControllerPtr ctrl)
565 566
{

567
    if (virLXCControllerSetupCpuAffinity(ctrl) < 0)
568 569
        return -1;

570
    if (virLXCControllerSetupNUMAPolicy(ctrl) < 0)
571 572
        return -1;

573
    return virLXCCgroupSetup(ctrl->def);
D
Dan Smith 已提交
574 575
}

576

577 578 579 580 581 582 583 584 585 586 587 588 589
/**
 * virLXCControllerClientCloseHook:
 * @client: the client that has closed
 *
 * Forgets @client if it is the controller's current client. When the
 * controller is shutting down, the shutdown timer is updated to a
 * timeout of 0.
 */
static void virLXCControllerClientCloseHook(virNetServerClientPtr client)
{
    virLXCControllerPtr controller = virNetServerClientGetPrivateData(client);

    VIR_DEBUG("Client %p has closed", client);

    if (controller->client == client)
        controller->client = NULL;

    if (!controller->inShutdown)
        return;

    VIR_DEBUG("Arm timer to quit event loop");
    virEventUpdateTimeout(controller->timerShutdown, 0);
}

590 591
/**
 * virLXCControllerClientPrivateFree:
 * @data: the client private data
 *
 * Only logs the pointer; nothing is released here.
 */
static void virLXCControllerClientPrivateFree(void *data)
{
    VIR_DEBUG("Got private data free %p", data);
}

/**
 * virLXCControllerClientPrivateNew:
 * @client: the newly connected client
 * @opaque: the controller state
 *
 * Installs the close hook on @client and records it as the current
 * client. If this is the first client and an init pid is already
 * known, the init event is sent.
 *
 * Returns the controller, used as the client's private data
 */
static void *virLXCControllerClientPrivateNew(virNetServerClientPtr client,
                                              void *opaque)
{
    virLXCControllerPtr controller = opaque;

    virNetServerClientSetCloseHook(client, virLXCControllerClientCloseHook);
    VIR_DEBUG("Got new client %p", client);
    controller->client = client;

    if (controller->initpid && controller->firstClient) {
        virLXCControllerEventSendInit(controller, controller->initpid);
    }
    controller->firstClient = false;

    return controller;
}

612 613

/**
 * virLXCControllerSetupServer:
 * @ctrl: the controller state
 *
 * Creates the controller's server, adds a UNIX socket service at
 * "<LXC_STATE_DIR>/<name>.sock" to it and creates the LXC protocol
 * program. On failure the server is released and ctrl->server is
 * reset to NULL.
 *
 * Returns 0 on success or -1 in case of error
 */
static int virLXCControllerSetupServer(virLXCControllerPtr ctrl)
{
    virNetServerServicePtr svc = NULL;
    char *sockpath;
    int ret = -1;

    if (virAsprintf(&sockpath, "%s/%s.sock",
                    LXC_STATE_DIR, ctrl->name) < 0) {
        virReportOOMError();
        return -1;
    }

    ctrl->server = virNetServerNew(0, 0, 0, 1,
                                   -1, 0, false,
                                   NULL,
                                   virLXCControllerClientPrivateNew,
                                   NULL,
                                   virLXCControllerClientPrivateFree,
                                   ctrl);
    if (!ctrl->server)
        goto cleanup;

    svc = virNetServerServiceNewUNIX(sockpath,
                                     0700,
                                     0,
                                     0,
                                     false,
                                     5,
                                     NULL);
    if (!svc)
        goto cleanup;

    if (virNetServerAddService(ctrl->server, svc, NULL) < 0)
        goto cleanup;

    /* Drop the local reference once the service has been added */
    virObjectUnref(svc);
    svc = NULL;

    ctrl->prog = virNetServerProgramNew(VIR_LXC_PROTOCOL_PROGRAM,
                                        VIR_LXC_PROTOCOL_PROGRAM_VERSION,
                                        virLXCProtocolProcs,
                                        virLXCProtocolNProcs);
    if (!ctrl->prog)
        goto cleanup;

    virNetServerUpdateServices(ctrl->server, true);
    ret = 0;

cleanup:
    VIR_FREE(sockpath);
    if (ret < 0) {
        virObjectUnref(ctrl->server);
        ctrl->server = NULL;
        virObjectUnref(svc);
    }
    return ret;
}
664

D
Daniel P. Berrange 已提交
665 666 667 668 669 670 671 672 673

/**
 * lxcControllerClearCapabilities:
 *
 * When built with libcap-ng, clears all capabilities and applies the
 * result. Without libcap-ng only a warning is logged.
 *
 * Returns 0 on success or -1 if the capabilities could not be applied
 */
static int lxcControllerClearCapabilities(void)
{
#if HAVE_CAPNG
    int rc;

    capng_clear(CAPNG_SELECT_BOTH);

    rc = capng_apply(CAPNG_SELECT_BOTH);
    if (rc < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("failed to apply capabilities: %d"), rc);
        return -1;
    }
#else
    VIR_WARN("libcap-ng support not compiled in, unable to clear capabilities");
#endif
    return 0;
}

684
static bool wantReboot = false;
685 686 687
static virMutex lock;


688
static void virLXCControllerSignalChildIO(virNetServerPtr server,
689
                                          siginfo_t *info ATTRIBUTE_UNUSED,
690
                                          void *opaque)
691
{
692
    virLXCControllerPtr ctrl = opaque;
693
    int ret;
694
    int status;
695

696
    ret = waitpid(-1, &status, WNOHANG);
697
    if (ret == ctrl->initpid) {
698
        virNetServerQuit(server);
699
        virMutexLock(&lock);
700 701 702
        if (WIFSIGNALED(status) &&
            WTERMSIG(status) == SIGHUP)
            wantReboot = true;
703 704
        virMutexUnlock(&lock);
    }
705 706 707
}


708
/**
 * virLXCControllerConsoleUpdateWatch:
 * @console: the console to update
 *
 * Recomputes which events the event loop should watch on the host
 * and container FDs from the buffer fill levels and the closed and
 * blocking flags. A side flagged as closed is added to, or modified
 * in, the console's epoll set; a side no longer flagged is removed
 * from it. Any epoll_ctl() failure asks the server to quit.
 */
static void virLXCControllerConsoleUpdateWatch(virLXCControllerConsolePtr console)
{
    int hostMask = 0;
    int contMask = 0;

    if (!console->hostClosed || (!console->hostBlocking && console->fromContLen)) {
        /* Read only while there is room; write only while data is pending */
        if (console->fromHostLen < sizeof(console->fromHostBuf))
            hostMask |= VIR_EVENT_HANDLE_READABLE;
        if (console->fromContLen)
            hostMask |= VIR_EVENT_HANDLE_WRITABLE;
    }
    if (!console->contClosed || (!console->contBlocking && console->fromHostLen)) {
        if (console->fromContLen < sizeof(console->fromContBuf))
            contMask |= VIR_EVENT_HANDLE_READABLE;
        if (console->fromHostLen)
            contMask |= VIR_EVENT_HANDLE_WRITABLE;
    }

    VIR_DEBUG("Container watch %d=%d host watch %d=%d",
              console->contWatch, contMask,
              console->hostWatch, hostMask);
    virEventUpdateHandle(console->contWatch, contMask);
    virEventUpdateHandle(console->hostWatch, hostMask);

    /* Host side epoll registration */
    if (console->hostClosed) {
        int wanted = EPOLLIN | EPOLLET;

        if (console->hostBlocking)
            wanted |= EPOLLOUT;

        if (wanted != console->hostEpoll) {
            struct epoll_event ev;
            /* A non-zero mask means the FD is already in the epoll set */
            int op = console->hostEpoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;

            VIR_DEBUG("newHostEvents=%x oldHostEvents=%x", wanted, console->hostEpoll);

            ev.events = wanted;
            ev.data.fd = console->hostFd;
            if (epoll_ctl(console->epollFd, op, console->hostFd, &ev) < 0) {
                VIR_DEBUG(":fail");
                virReportSystemError(errno, "%s",
                                     _("Unable to add epoll fd"));
                virNetServerQuit(console->server);
                return;
            }
            console->hostEpoll = wanted;
            VIR_DEBUG("newHostEvents=%x oldHostEvents=%x", wanted, console->hostEpoll);
        }
    } else if (console->hostEpoll) {
        VIR_DEBUG("Stop epoll oldContEvents=%x", console->hostEpoll);
        if (epoll_ctl(console->epollFd, EPOLL_CTL_DEL, console->hostFd, NULL) < 0) {
            virReportSystemError(errno, "%s",
                                 _("Unable to remove epoll fd"));
            VIR_DEBUG(":fail");
            virNetServerQuit(console->server);
            return;
        }
        console->hostEpoll = 0;
    }

    /* Container side epoll registration */
    if (console->contClosed) {
        int wanted = EPOLLIN | EPOLLET;

        if (console->contBlocking)
            wanted |= EPOLLOUT;

        if (wanted != console->contEpoll) {
            struct epoll_event ev;
            int op = console->contEpoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;

            VIR_DEBUG("newContEvents=%x oldContEvents=%x", wanted, console->contEpoll);

            ev.events = wanted;
            ev.data.fd = console->contFd;
            if (epoll_ctl(console->epollFd, op, console->contFd, &ev) < 0) {
                virReportSystemError(errno, "%s",
                                     _("Unable to add epoll fd"));
                VIR_DEBUG(":fail");
                virNetServerQuit(console->server);
                return;
            }
            console->contEpoll = wanted;
            VIR_DEBUG("newHostEvents=%x oldHostEvents=%x", wanted, console->contEpoll);
        }
    } else if (console->contEpoll) {
        VIR_DEBUG("Stop epoll oldContEvents=%x", console->contEpoll);
        if (epoll_ctl(console->epollFd, EPOLL_CTL_DEL, console->contFd, NULL) < 0) {
            virReportSystemError(errno, "%s",
                                 _("Unable to remove epoll fd"));
            VIR_DEBUG(":fail");
            virNetServerQuit(console->server);
            return;
        }
        console->contEpoll = 0;
    }
}
808 809


810
static void virLXCControllerConsoleEPoll(int watch, int fd, int events, void *opaque)
811
{
812
    virLXCControllerConsolePtr console = opaque;
813

814 815 816 817 818 819 820 821 822 823
    virMutexLock(&lock);
    VIR_DEBUG("IO event watch=%d fd=%d events=%d fromHost=%zu fromcont=%zu",
              watch, fd, events,
              console->fromHostLen,
              console->fromContLen);

    while (1) {
        struct epoll_event event;
        int ret;
        ret = epoll_wait(console->epollFd, &event, 1, 0);
824
        if (ret < 0) {
S
Stefan Berger 已提交
825
            if (errno == EINTR)
826 827 828
                continue;
            virReportSystemError(errno, "%s",
                                 _("Unable to wait on epoll"));
829
            virNetServerQuit(console->server);
830 831 832
            goto cleanup;
        }

833 834 835 836 837 838 839
        if (ret == 0)
            break;

        VIR_DEBUG("fd=%d hostFd=%d contFd=%d hostEpoll=%x contEpoll=%x",
                  event.data.fd, console->hostFd, console->contFd,
                  console->hostEpoll, console->contEpoll);

840 841 842
        /* If we get HUP+dead PID, we just re-enable the main loop
         * which will see the PID has died and exit */
        if ((event.events & EPOLLIN)) {
843 844
            if (event.data.fd == console->hostFd) {
                console->hostClosed = false;
845
            } else {
846
                console->contClosed = false;
847
            }
848
            virLXCControllerConsoleUpdateWatch(console);
849 850 851 852 853
            break;
        }
    }

cleanup:
854
    virMutexUnlock(&lock);
855 856
}

857
static void virLXCControllerConsoleIO(int watch, int fd, int events, void *opaque)
858
{
859
    virLXCControllerConsolePtr console = opaque;
860 861

    virMutexLock(&lock);
862 863 864 865
    VIR_DEBUG("IO event watch=%d fd=%d events=%d fromHost=%zu fromcont=%zu",
              watch, fd, events,
              console->fromHostLen,
              console->fromContLen);
866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921
    if (events & VIR_EVENT_HANDLE_READABLE) {
        char *buf;
        size_t *len;
        size_t avail;
        ssize_t done;
        if (watch == console->hostWatch) {
            buf = console->fromHostBuf;
            len = &console->fromHostLen;
            avail = sizeof(console->fromHostBuf) - *len;
        } else {
            buf = console->fromContBuf;
            len = &console->fromContLen;
            avail = sizeof(console->fromContBuf) - *len;
        }
    reread:
        done = read(fd, buf + *len, avail);
        if (done == -1 && errno == EINTR)
            goto reread;
        if (done == -1 && errno != EAGAIN) {
            virReportSystemError(errno, "%s",
                                 _("Unable to read container pty"));
            goto error;
        }
        if (done > 0) {
            *len += done;
        } else {
            VIR_DEBUG("Read fd %d done %d errno %d", fd, (int)done, errno);
        }
    }

    if (events & VIR_EVENT_HANDLE_WRITABLE) {
        char *buf;
        size_t *len;
        ssize_t done;
        if (watch == console->hostWatch) {
            buf = console->fromContBuf;
            len = &console->fromContLen;
        } else {
            buf = console->fromHostBuf;
            len = &console->fromHostLen;
        }

    rewrite:
        done = write(fd, buf, *len);
        if (done == -1 && errno == EINTR)
            goto rewrite;
        if (done == -1 && errno != EAGAIN) {
            virReportSystemError(errno, "%s",
                                 _("Unable to write to container pty"));
            goto error;
        }
        if (done > 0) {
            memmove(buf, buf + done, (*len - done));
            *len -= done;
        } else {
            VIR_DEBUG("Write fd %d done %d errno %d", fd, (int)done, errno);
922 923 924 925
            if (watch == console->hostWatch)
                console->hostBlocking = true;
            else
                console->contBlocking = true;
926 927 928 929 930 931 932 933 934 935 936 937
        }
    }

    if (events & VIR_EVENT_HANDLE_HANGUP) {
        if (watch == console->hostWatch) {
            console->hostClosed = true;
        } else {
            console->contClosed = true;
        }
        VIR_DEBUG("Got EOF on %d %d", watch, fd);
    }

938
    virLXCControllerConsoleUpdateWatch(console);
939 940 941 942 943 944 945
    virMutexUnlock(&lock);
    return;

error:
    virEventRemoveHandle(console->contWatch);
    virEventRemoveHandle(console->hostWatch);
    console->contWatch = console->hostWatch = -1;
946
    virNetServerQuit(console->server);
947 948 949 950
    virMutexUnlock(&lock);
}


951
/**
952
 * lxcControllerMain
953 954
 * @serverFd: server socket fd to accept client requests
 * @clientFd: initial client which is the libvirtd daemon
955
 *
956
 * Processes I/O on consoles and the monitor
957 958 959
 *
 * Returns 0 on success or -1 in case of error
 */
960
static int virLXCControllerMain(virLXCControllerPtr ctrl)
961
{
962
    virErrorPtr err;
963
    int rc = -1;
964
    size_t i;
965 966 967 968

    if (virMutexInit(&lock) < 0)
        goto cleanup2;

969 970 971 972
    if (virNetServerAddSignalHandler(ctrl->server,
                                     SIGCHLD,
                                     virLXCControllerSignalChildIO,
                                     ctrl) < 0)
973 974
        goto cleanup;

975 976
    virResetLastError();

977 978
    for (i = 0 ; i < ctrl->nconsoles ; i++) {
        if ((ctrl->consoles[i].epollFd = epoll_create1(EPOLL_CLOEXEC)) < 0) {
979 980 981 982 983
            virReportSystemError(errno, "%s",
                                 _("Unable to create epoll fd"));
            goto cleanup;
        }

984 985 986 987 988
        if ((ctrl->consoles[i].epollWatch = virEventAddHandle(ctrl->consoles[i].epollFd,
                                                              VIR_EVENT_HANDLE_READABLE,
                                                              virLXCControllerConsoleEPoll,
                                                              &(ctrl->consoles[i]),
                                                              NULL)) < 0) {
989 990
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("Unable to watch epoll FD"));
991 992 993
            goto cleanup;
        }

994 995 996 997 998
        if ((ctrl->consoles[i].hostWatch = virEventAddHandle(ctrl->consoles[i].hostFd,
                                                             VIR_EVENT_HANDLE_READABLE,
                                                             virLXCControllerConsoleIO,
                                                             &(ctrl->consoles[i]),
                                                             NULL)) < 0) {
999 1000
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("Unable to watch host console PTY"));
1001 1002 1003
            goto cleanup;
        }

1004 1005 1006 1007 1008
        if ((ctrl->consoles[i].contWatch = virEventAddHandle(ctrl->consoles[i].contFd,
                                                             VIR_EVENT_HANDLE_READABLE,
                                                             virLXCControllerConsoleIO,
                                                             &(ctrl->consoles[i]),
                                                             NULL)) < 0) {
1009 1010
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("Unable to watch host console PTY"));
1011 1012
            goto cleanup;
        }
1013
    }
1014

1015
    virNetServerRun(ctrl->server);
1016

1017 1018
    err = virGetLastError();
    if (!err || err->code == VIR_ERR_OK)
1019
        rc = wantReboot ? 1 : 0;
1020 1021

cleanup:
1022 1023
    virMutexDestroy(&lock);
cleanup2:
1024

1025 1026
    for (i = 0 ; i < ctrl->nconsoles ; i++)
        virLXCControllerConsoleClose(&(ctrl->consoles[i]));
1027

1028 1029 1030
    return rc;
}

1031 1032 1033


/**
1034
 * virLXCControllerMoveInterfaces
1035 1036 1037 1038 1039 1040 1041 1042
 * @nveths: number of interfaces
 * @veths: interface names
 * @container: pid of container
 *
 * Moves network interfaces into a container's namespace
 *
 * Returns 0 on success or -1 in case of error
 */
1043
static int virLXCControllerMoveInterfaces(virLXCControllerPtr ctrl)
1044
{
1045 1046 1047
    size_t i;

    for (i = 0 ; i < ctrl->nveths ; i++) {
1048
        if (virNetDevSetNamespace(ctrl->veths[i], ctrl->initpid) < 0)
1049
            return -1;
1050
    }
1051 1052 1053 1054 1055 1056

    return 0;
}


/**
1057 1058
 * virLXCControllerDeleteInterfaces:
 * @ctrl: the LXC controller
1059 1060 1061 1062 1063
 *
 * Cleans up the container interfaces by deleting the veth device pairs.
 *
 * Returns 0 on success or -1 in case of error
 */
1064
static int virLXCControllerDeleteInterfaces(virLXCControllerPtr ctrl)
1065
{
1066 1067
    size_t i;
    int ret = 0;
1068

1069 1070 1071 1072 1073 1074
    for (i = 0 ; i < ctrl->nveths ; i++) {
        if (virNetDevVethDelete(ctrl->veths[i]) < 0)
            ret = -1;
    }

    return ret;
1075 1076
}

1077

1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096
/*
 * lxcSetPersonality:
 * @def: domain definition whose os.arch is checked
 *
 * Looks up the host machine name with uname() and asks
 * lxcContainerGetAlt32bitArch() for its alternate architecture. If
 * one exists and equals def->os.arch, the process personality is
 * switched to PER_LINUX32.
 *
 * Returns 0 on success (including when no switch is needed) or -1
 * in case of error
 */
static int lxcSetPersonality(virDomainDefPtr def)
{
    struct utsname utsname;
    const char *altArch;

    /* utsname is only filled in on success; never read it otherwise */
    if (uname(&utsname) < 0) {
        virReportSystemError(errno, "%s",
                             _("Unable to determine host architecture"));
        return -1;
    }

    altArch = lxcContainerGetAlt32bitArch(utsname.machine);
    if (altArch &&
        STREQ(def->os.arch, altArch)) {
        if (personality(PER_LINUX32) < 0) {
            virReportSystemError(errno, _("Unable to request personality for %s on %s"),
                                 altArch, utsname.machine);
            return -1;
        }
    }
    return 0;
}

1097
/* Supply the mount flag values used below when the included headers
 * do not define them */
#ifndef MS_REC
# define MS_REC   (1<<14)
#endif

#ifndef MS_SLAVE
# define MS_SLAVE (1<<19)
#endif
1104

1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148
/* Create a private tty using the private devpts at PTMX, returning
 * the master in *TTYMASTER and the name of the slave, _from the
 * perspective of the guest after remounting file systems_, in
 * *TTYNAME.  Heavily borrowed from glibc, but doesn't require that
 * devpts == "/dev/pts" */
static int
lxcCreateTty(char *ptmx, int *ttymaster, char **ttyName)
{
    int ret = -1;
    int ptyno;
    int unlock = 0;

    if ((*ttymaster = open(ptmx, O_RDWR|O_NOCTTY|O_NONBLOCK)) < 0)
        goto cleanup;

    if (ioctl(*ttymaster, TIOCSPTLCK, &unlock) < 0)
        goto cleanup;

    if (ioctl(*ttymaster, TIOCGPTN, &ptyno) < 0)
        goto cleanup;

    /* If mount() succeeded at honoring newinstance, then the kernel
     * was new enough to also honor the mode=0620,gid=5 options, which
     * guarantee that the new pty already has correct permissions; so
     * while glibc has to fstat(), fchmod(), and fchown() for older
     * kernels, we can skip those steps.  ptyno shouldn't currently be
     * anything other than 0, but let's play it safe.  */
    if (virAsprintf(ttyName, "/dev/pts/%d", ptyno) < 0) {
        virReportOOMError();
        errno = ENOMEM;
        goto cleanup;
    }

    ret = 0;

cleanup:
    if (ret != 0) {
        VIR_FORCE_CLOSE(*ttymaster);
        VIR_FREE(*ttyName);
    }

    return ret;
}

1149

1150
static int
1151
virLXCControllerSetupDevPTS(virLXCControllerPtr ctrl)
1152
{
1153
    virDomainFSDefPtr root = virDomainGetRootFilesystem(ctrl->def);
1154
    char *mount_options = NULL;
1155
    char *opts = NULL;
1156 1157
    char *devpts = NULL;
    int ret = -1;
1158

1159 1160
    if (!root) {
        if (ctrl->nconsoles != 1) {
1161 1162 1163
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("Expected exactly one console, but got %zu"),
                           ctrl->nconsoles);
1164 1165 1166
            return -1;
        }
        return 0;
1167 1168
    }

1169
    VIR_DEBUG("Setting up private /dev/pts");
1170

1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190
    /*
     * If doing a chroot style setup, we need to prepare
     * a private /dev/pts for the child now, which they
     * will later move into position.
     *
     * This is complex because 'virsh console' needs to
     * use /dev/pts from the host OS, and the guest OS
     * needs to use /dev/pts from the guest.
     *
     * This means that we (libvirt_lxc) need to see and
     * use both /dev/pts instances. We're running in the
     * host OS context though and don't want to expose
     * the guest OS /dev/pts there.
     *
     * Thus we call unshare(CLONE_NS) so that we can see
     * the guest's new /dev/pts, without it becoming
     * visible to the host OS. We also put the root FS
     * into slave mode, just in case it was currently
     * marked as shared
     */
1191 1192
    mount_options = virSecurityManagerGetMountOptions(ctrl->securityManager,
                                                      ctrl->def);
1193

1194 1195 1196 1197 1198 1199
    if (!virFileExists(root->src)) {
        virReportSystemError(errno,
                             _("root source %s does not exist"),
                             root->src);
        goto cleanup;
    }
1200

1201 1202 1203 1204 1205
    if (unshare(CLONE_NEWNS) < 0) {
        virReportSystemError(errno, "%s",
                             _("Cannot unshare mount namespace"));
        goto cleanup;
    }
1206

1207 1208 1209 1210 1211
    if (mount("", "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to switch root mount into slave mode"));
        goto cleanup;
    }
1212

1213 1214 1215 1216 1217
    if (virAsprintf(&devpts, "%s/dev/pts", root->src) < 0 ||
        virAsprintf(&ctrl->devptmx, "%s/dev/pts/ptmx", root->src) < 0) {
        virReportOOMError();
        goto cleanup;
    }
1218

1219 1220 1221 1222 1223 1224
    if (virFileMakePath(devpts) < 0) {
        virReportSystemError(errno,
                             _("Failed to make path %s"),
                             devpts);
        goto cleanup;
    }
1225

1226 1227 1228 1229 1230 1231 1232
    /* XXX should we support gid=X for X!=5 for distros which use
     * a different gid for tty?  */
    if (virAsprintf(&opts, "newinstance,ptmxmode=0666,mode=0620,gid=5%s",
                    (mount_options ? mount_options : "")) < 0) {
        virReportOOMError();
        goto cleanup;
    }
1233

1234 1235 1236 1237 1238 1239 1240 1241
    VIR_DEBUG("Mount devpts on %s type=tmpfs flags=%x, opts=%s",
              devpts, MS_NOSUID, opts);
    if (mount("devpts", devpts, "devpts", MS_NOSUID, opts) < 0) {
        virReportSystemError(errno,
                             _("Failed to mount devpts on %s"),
                             devpts);
        goto cleanup;
    }
1242

1243 1244 1245
    if (access(ctrl->devptmx, R_OK) < 0) {
        VIR_WARN("Kernel does not support private devpts, using shared devpts");
        VIR_FREE(ctrl->devptmx);
1246 1247
    }

1248 1249 1250 1251 1252 1253 1254 1255 1256
    ret = 0;

cleanup:
    VIR_FREE(opts);
    VIR_FREE(devpts);
    return ret;
}


G
Gao feng 已提交
1257 1258 1259 1260 1261 1262
/* Hand ctrl->fuse and the domain definition to lxcSetupFuse() and
 * pass its result straight back to the caller. */
static int
virLXCControllerSetupFuse(virLXCControllerPtr ctrl)
{
    int rc = lxcSetupFuse(&ctrl->fuse, ctrl->def);

    return rc;
}

1263 1264 1265 1266 1267 1268
/*
 * virLXCControllerSetupConsoles:
 * @ctrl: the LXC controller
 * @containerTTYPaths: array with one slot per console which receives
 *                     the tty path produced for that console
 *
 * Opens one tty per console, storing the FD in the console's contFd.
 * When ctrl->devptmx is set the tty comes from lxcCreateTty() on that
 * path, otherwise from virFileOpenTty().
 *
 * Returns 0 on success, or -1 as soon as one tty cannot be allocated
 */
static int
virLXCControllerSetupConsoles(virLXCControllerPtr ctrl,
                              char **containerTTYPaths)
{
    size_t idx;

    for (idx = 0 ; idx < ctrl->nconsoles ; idx++) {
        int opened;

        if (ctrl->devptmx) {
            VIR_DEBUG("Opening tty on private %s", ctrl->devptmx);
            opened = lxcCreateTty(ctrl->devptmx,
                                  &ctrl->consoles[idx].contFd,
                                  &containerTTYPaths[idx]);
        } else {
            VIR_DEBUG("Opening tty on shared /dev/ptmx");
            opened = virFileOpenTty(&ctrl->consoles[idx].contFd,
                                    &containerTTYPaths[idx],
                                    0);
        }

        /* Both allocators leave the reason in errno */
        if (opened < 0) {
            virReportSystemError(errno, "%s",
                                 _("Failed to allocate tty"));
            return -1;
        }
    }

    return 0;
}


1294 1295 1296 1297 1298 1299 1300 1301
/*
 * virLXCControllerEventSend:
 * @ctrl: the LXC controller
 * @procnr: procedure number placed in the message header
 * @proc: XDR routine used to encode, and afterwards free, @data
 * @data: event payload
 *
 * Encodes @data into a new message and queues it on ctrl->client.
 * The event is dropped with a warning when no client is set.
 * Failures are not reported to the caller. @data is released with
 * xdr_free() on every path.
 */
static void
virLXCControllerEventSend(virLXCControllerPtr ctrl,
                          int procnr,
                          xdrproc_t proc,
                          void *data)
{
    virNetMessagePtr msg;

    if (!ctrl->client) {
        VIR_WARN("Dropping event %d because libvirtd is not connected", procnr);
        /* Release the payload here too, matching every other exit path */
        xdr_free(proc, data);
        return;
    }

    VIR_DEBUG("Send event %d client=%p", procnr, ctrl->client);
    if (!(msg = virNetMessageNew(false)))
        goto error;

    msg->header.prog = virNetServerProgramGetID(ctrl->prog);
    msg->header.vers = virNetServerProgramGetVersion(ctrl->prog);
    msg->header.proc = procnr;
    msg->header.type = VIR_NET_MESSAGE;
    msg->header.serial = 1;
    msg->header.status = VIR_NET_OK;

    if (virNetMessageEncodeHeader(msg) < 0)
        goto error;

    if (virNetMessageEncodePayload(msg, proc, data) < 0)
        goto error;

    VIR_DEBUG("Queue event %d %zu", procnr, msg->bufferLength);
    /* NOTE(review): a failed send is expected to leave the message
     * with the caller, so free it on the error path - confirm against
     * virNetServerClientSendMessage() */
    if (virNetServerClientSendMessage(ctrl->client, msg) < 0)
        goto error;

    xdr_free(proc, data);
    return;

error:
    virNetMessageFree(msg);
    xdr_free(proc, data);
}


/*
 * virLXCControllerEventSendExit:
 * @ctrl: the LXC controller
 * @exitstatus: 0 maps to the SHUTDOWN status, 1 to REBOOT, any other
 *              value to ERROR
 *
 * Sends the exit event. If a client is set, marks the controller as
 * shutting down, requests a delayed close of the client and runs the
 * server again.
 *
 * Always returns 0
 */
static int
virLXCControllerEventSendExit(virLXCControllerPtr ctrl,
                              int exitstatus)
{
    virLXCProtocolExitEventMsg exitMsg;

    VIR_DEBUG("Exit status %d (client=%p)", exitstatus, ctrl->client);
    memset(&exitMsg, 0, sizeof(exitMsg));

    if (exitstatus == 0)
        exitMsg.status = VIR_LXC_PROTOCOL_EXIT_STATUS_SHUTDOWN;
    else if (exitstatus == 1)
        exitMsg.status = VIR_LXC_PROTOCOL_EXIT_STATUS_REBOOT;
    else
        exitMsg.status = VIR_LXC_PROTOCOL_EXIT_STATUS_ERROR;

    virLXCControllerEventSend(ctrl,
                              VIR_LXC_PROTOCOL_PROC_EXIT_EVENT,
                              (xdrproc_t)xdr_virLXCProtocolExitEventMsg,
                              (void*)&exitMsg);

    if (ctrl->client) {
        VIR_DEBUG("Waiting for client to complete dispatch");
        ctrl->inShutdown = true;
        virNetServerClientDelayedClose(ctrl->client);
        virNetServerRun(ctrl->server);
    }

    VIR_DEBUG("Client has gone away");
    return 0;
}


1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389
/*
 * virLXCControllerEventSendInit:
 * @ctrl: the LXC controller
 * @initpid: pid carried in the init event
 *
 * Builds an init event message holding @initpid and passes it to
 * virLXCControllerEventSend().
 *
 * Always returns 0
 */
static int
virLXCControllerEventSendInit(virLXCControllerPtr ctrl,
                              pid_t initpid)
{
    virLXCProtocolInitEventMsg initMsg;

    memset(&initMsg, 0, sizeof(initMsg));
    initMsg.initpid = initpid;

    VIR_DEBUG("Init pid %llu", (unsigned long long)initpid);

    virLXCControllerEventSend(ctrl,
                              VIR_LXC_PROTOCOL_PROC_INIT_EVENT,
                              (xdrproc_t)xdr_virLXCProtocolInitEventMsg,
                              (void*)&initMsg);

    return 0;
}


1390
static int
1391
virLXCControllerRun(virLXCControllerPtr ctrl)
1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418
{
    int rc = -1;
    int control[2] = { -1, -1};
    int containerhandshake[2] = { -1, -1 };
    char **containerTTYPaths = NULL;
    size_t i;

    if (VIR_ALLOC_N(containerTTYPaths, ctrl->nconsoles) < 0) {
        virReportOOMError();
        goto cleanup;
    }

    if (socketpair(PF_UNIX, SOCK_STREAM, 0, control) < 0) {
        virReportSystemError(errno, "%s",
                             _("sockpair failed"));
        goto cleanup;
    }

    if (socketpair(PF_UNIX, SOCK_STREAM, 0, containerhandshake) < 0) {
        virReportSystemError(errno, "%s",
                             _("socketpair failed"));
        goto cleanup;
    }

    if (virLXCControllerSetupLoopDevices(ctrl) < 0)
        goto cleanup;

1419
    if (virLXCControllerSetupResourceLimits(ctrl) < 0)
1420 1421 1422 1423 1424
        goto cleanup;

    if (virLXCControllerSetupDevPTS(ctrl) < 0)
        goto cleanup;

G
Gao feng 已提交
1425 1426 1427
    if (virLXCControllerSetupFuse(ctrl) < 0)
        goto cleanup;

1428 1429
    if (virLXCControllerSetupConsoles(ctrl, containerTTYPaths) < 0)
        goto cleanup;
1430

1431
    if (lxcSetPersonality(ctrl->def) < 0)
1432
        goto cleanup;
1433

1434
    if ((ctrl->initpid = lxcContainerStart(ctrl->def,
1435
                                           ctrl->securityManager,
1436 1437 1438 1439
                                           ctrl->nveths,
                                           ctrl->veths,
                                           control[1],
                                           containerhandshake[1],
1440 1441
                                           containerTTYPaths,
                                           ctrl->nconsoles)) < 0)
1442
        goto cleanup;
1443
    VIR_FORCE_CLOSE(control[1]);
1444
    VIR_FORCE_CLOSE(containerhandshake[1]);
1445

1446
    if (virLXCControllerMoveInterfaces(ctrl) < 0)
1447 1448
        goto cleanup;

1449 1450 1451
    if (lxcContainerSendContinue(control[0]) < 0) {
        virReportSystemError(errno, "%s",
                             _("Unable to send container continue message"));
1452
        goto cleanup;
1453
    }
1454

1455 1456 1457 1458 1459 1460
    if (lxcContainerWaitForContinue(containerhandshake[0]) < 0) {
        virReportSystemError(errno, "%s",
                             _("error receiving signal from container"));
        goto cleanup;
    }

1461 1462 1463
    /* Now the container is fully setup... */

    /* ...we can close the loop devices... */
1464 1465
    if (virLXCControllerCloseLoopDevices(ctrl, false) < 0)
        goto cleanup;
1466 1467

    /* ...and reduce our privileges */
D
Daniel P. Berrange 已提交
1468 1469 1470
    if (lxcControllerClearCapabilities() < 0)
        goto cleanup;

1471
    if (virLXCControllerDaemonHandshake(ctrl) < 0)
1472 1473
        goto cleanup;

1474 1475
    for (i = 0 ; i < ctrl->nconsoles ; i++)
        if (virLXCControllerConsoleSetNonblocking(&(ctrl->consoles[i])) < 0)
1476
            goto cleanup;
1477

1478
    rc = virLXCControllerMain(ctrl);
1479

1480 1481
    virLXCControllerEventSendExit(ctrl, rc);

1482
cleanup:
1483 1484
    VIR_FORCE_CLOSE(control[0]);
    VIR_FORCE_CLOSE(control[1]);
1485 1486
    VIR_FORCE_CLOSE(containerhandshake[0]);
    VIR_FORCE_CLOSE(containerhandshake[1]);
1487

1488 1489 1490
    for (i = 0 ; i < ctrl->nconsoles ; i++)
        VIR_FREE(containerTTYPaths[i]);
    VIR_FREE(containerTTYPaths);
1491

1492
    virLXCControllerStopInit(ctrl);
1493

1494 1495 1496 1497
    return rc;
}


1498
int main(int argc, char *argv[])
1499 1500
{
    pid_t pid;
1501 1502
    int rc = 1;
    char *name = NULL;
1503
    size_t nveths = 0;
1504
    char **veths = NULL;
1505
    int handshakeFd = -1;
1506
    int bg = 0;
1507
    const struct option options[] = {
1508 1509 1510 1511
        { "background", 0, NULL, 'b' },
        { "name",   1, NULL, 'n' },
        { "veth",   1, NULL, 'v' },
        { "console", 1, NULL, 'c' },
1512
        { "handshakefd", 1, NULL, 's' },
1513
        { "security", 1, NULL, 'S' },
1514 1515 1516
        { "help", 0, NULL, 'h' },
        { 0, 0, 0, 0 },
    };
1517 1518
    int *ttyFDs = NULL;
    size_t nttyFDs = 0;
1519
    virLXCControllerPtr ctrl = NULL;
1520
    size_t i;
1521
    const char *securityDriver = "none";
1522

E
Eric Blake 已提交
1523 1524
    if (setlocale(LC_ALL, "") == NULL ||
        bindtextdomain(PACKAGE, LOCALEDIR) == NULL ||
1525
        textdomain(PACKAGE) == NULL) {
E
Eric Blake 已提交
1526 1527 1528 1529
        fprintf(stderr, _("%s: initialization failed\n"), argv[0]);
        exit(EXIT_FAILURE);
    }

1530 1531 1532
    /* Initialize logging */
    virLogSetFromEnv();

1533 1534
    while (1) {
        int c;
1535

1536
        c = getopt_long(argc, argv, "dn:v:m:c:s:h:S:",
1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548
                       options, NULL);

        if (c == -1)
            break;

        switch (c) {
        case 'b':
            bg = 1;
            break;

        case 'n':
            if ((name = strdup(optarg)) == NULL) {
1549
                virReportOOMError();
1550
                goto cleanup;
1551
            }
1552 1553 1554 1555
            break;

        case 'v':
            if (VIR_REALLOC_N(veths, nveths+1) < 0) {
1556
                virReportOOMError();
1557
                goto cleanup;
1558
            }
1559
            if ((veths[nveths++] = strdup(optarg)) == NULL) {
1560
                virReportOOMError();
1561
                goto cleanup;
1562
            }
1563 1564 1565
            break;

        case 'c':
1566 1567 1568 1569 1570
            if (VIR_REALLOC_N(ttyFDs, nttyFDs + 1) < 0) {
                virReportOOMError();
                goto cleanup;
            }
            if (virStrToLong_i(optarg, NULL, 10, &ttyFDs[nttyFDs++]) < 0) {
1571 1572 1573 1574 1575
                fprintf(stderr, "malformed --console argument '%s'", optarg);
                goto cleanup;
            }
            break;

1576
        case 's':
1577
            if (virStrToLong_i(optarg, NULL, 10, &handshakeFd) < 0) {
1578 1579 1580 1581 1582 1583
                fprintf(stderr, "malformed --handshakefd argument '%s'",
                        optarg);
                goto cleanup;
            }
            break;

1584
        case 'S':
1585
            securityDriver = optarg;
1586 1587
            break;

1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598
        case 'h':
        case '?':
            fprintf(stderr, "\n");
            fprintf(stderr, "syntax: %s [OPTIONS]\n", argv[0]);
            fprintf(stderr, "\n");
            fprintf(stderr, "Options\n");
            fprintf(stderr, "\n");
            fprintf(stderr, "  -b, --background\n");
            fprintf(stderr, "  -n NAME, --name NAME\n");
            fprintf(stderr, "  -c FD, --console FD\n");
            fprintf(stderr, "  -v VETH, --veth VETH\n");
1599
            fprintf(stderr, "  -s FD, --handshakefd FD\n");
1600
            fprintf(stderr, "  -S NAME, --security NAME\n");
1601 1602 1603
            fprintf(stderr, "  -h, --help\n");
            fprintf(stderr, "\n");
            goto cleanup;
1604 1605 1606
        }
    }

1607 1608 1609 1610 1611
    if (name == NULL) {
        fprintf(stderr, "%s: missing --name argument for configuration\n", argv[0]);
        goto cleanup;
    }

1612
    if (handshakeFd < 0) {
1613 1614 1615 1616 1617
        fprintf(stderr, "%s: missing --handshake argument for container PTY\n",
                argv[0]);
        goto cleanup;
    }

1618
    if (getuid() != 0) {
1619 1620 1621
        fprintf(stderr, "%s: must be run as the 'root' user\n", argv[0]);
        goto cleanup;
    }
1622

1623 1624
    virEventRegisterDefaultImpl();

1625
    if (!(ctrl = virLXCControllerNew(name)))
1626
        goto cleanup;
1627

1628 1629
    ctrl->handshakeFd = handshakeFd;

1630 1631 1632 1633 1634
    if (!(ctrl->securityManager = virSecurityManagerNew(securityDriver,
                                                        LXC_DRIVER_NAME,
                                                        false, false, false)))
        goto cleanup;

1635 1636 1637 1638 1639 1640 1641 1642 1643
    if (ctrl->def->seclabels) {
        VIR_DEBUG("Security model %s type %s label %s imagelabel %s",
                  NULLSTR(ctrl->def->seclabels[0]->model),
                  virDomainSeclabelTypeToString(ctrl->def->seclabels[0]->type),
                  NULLSTR(ctrl->def->seclabels[0]->label),
                  NULLSTR(ctrl->def->seclabels[0]->imagelabel));
    } else {
        VIR_DEBUG("Security model not initialized");
    }
1644

1645 1646 1647
    ctrl->veths = veths;
    ctrl->nveths = nveths;

1648 1649 1650 1651 1652 1653
    for (i = 0 ; i < nttyFDs ; i++) {
        if (virLXCControllerAddConsole(ctrl, ttyFDs[i]) < 0)
            goto cleanup;
        ttyFDs[i] = -1;
    }

1654
    if (virLXCControllerValidateNICs(ctrl) < 0)
1655
        goto cleanup;
1656

1657 1658 1659
    if (virLXCControllerValidateConsoles(ctrl) < 0)
        goto cleanup;

1660
    if (virLXCControllerSetupServer(ctrl) < 0)
1661
        goto cleanup;
1662

1663 1664 1665
    if (bg) {
        if ((pid = fork()) < 0)
            goto cleanup;
1666

1667
        if (pid > 0) {
1668
            if ((rc = virPidFileWrite(LXC_STATE_DIR, name, pid)) < 0) {
1669
                virReportSystemError(-rc,
1670 1671
                                     _("Unable to write pid file '%s/%s.pid'"),
                                     LXC_STATE_DIR, name);
1672 1673
                _exit(1);
            }
1674

1675 1676 1677 1678
            /* First child now exits, allowing original caller
             * (ie libvirtd's LXC driver to complete their
             * waitpid & continue */
            _exit(0);
1679 1680
        }

1681 1682
        /* Don't hold onto any cwd we inherit from libvirtd either */
        if (chdir("/") < 0) {
1683
            virReportSystemError(errno, "%s",
1684
                                 _("Unable to change to root dir"));
1685 1686 1687 1688
            goto cleanup;
        }

        if (setsid() < 0) {
1689
            virReportSystemError(errno, "%s",
1690
                                 _("Unable to become session leader"));
1691 1692 1693
            goto cleanup;
        }
    }
1694

1695
    rc = virLXCControllerRun(ctrl);
1696

1697
cleanup:
1698
    virPidFileDelete(LXC_STATE_DIR, name);
1699 1700
    if (ctrl)
        virLXCControllerDeleteInterfaces(ctrl);
1701 1702 1703 1704
    for (i = 0 ; i < nttyFDs ; i++)
        VIR_FORCE_CLOSE(ttyFDs[i]);
    VIR_FREE(ttyFDs);

1705
    virLXCControllerFree(ctrl);
1706

1707
    return rc < 0? EXIT_FAILURE : EXIT_SUCCESS;
1708
}