/*
 * Copyright (C) 2010-2016 Red Hat, Inc.
 * Copyright IBM Corp. 2008
 *
 * lxc_controller.c: linux container process controller
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 */

#include <config.h>

#include <sys/epoll.h>
25
#include <sys/wait.h>
26

27
#ifdef __linux__
28 29 30
# include <sys/sysmacros.h>
#endif

31
#include <sys/personality.h>
32
#include <unistd.h>
33 34
#include <fcntl.h>
#include <signal.h>
35
#include <getopt.h>
36
#include <sys/mount.h>
37 38
#include <grp.h>
#include <sys/stat.h>
39
#include <time.h>
40

41
#if WITH_CAPNG
42
# include <cap-ng.h>
D
Daniel P. Berrange 已提交
43 44
#endif

45
#include "virerror.h"
46
#include "virlog.h"
47 48

#include "lxc_conf.h"
49
#include "lxc_container.h"
50
#include "lxc_cgroup.h"
51
#include "lxc_monitor_protocol.h"
G
Gao feng 已提交
52
#include "lxc_fuse.h"
53 54
#include "virnetdev.h"
#include "virnetdevveth.h"
55
#include "viralloc.h"
E
Eric Blake 已提交
56
#include "virfile.h"
57
#include "virpidfile.h"
58
#include "vircommand.h"
59
#include "virhostcpu.h"
60
#include "virrandom.h"
61
#include "virprocess.h"
62
#include "virnuma.h"
63
#include "virdbus.h"
64
#include "rpc/virnetdaemon.h"
65
#include "virstring.h"
66
#include "virgettext.h"
67
#include "virsocket.h"
J
Ján Tomko 已提交
68
#include "virutil.h"
69

70 71
#define VIR_FROM_THIS VIR_FROM_LXC

72 73
VIR_LOG_INIT("lxc.lxc_controller");

74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
typedef struct _virLXCControllerConsole virLXCControllerConsole;
typedef virLXCControllerConsole *virLXCControllerConsolePtr;
struct _virLXCControllerConsole {
    int hostWatch;
    int hostFd;  /* PTY FD in the host OS */
    bool hostClosed;
    int hostEpoll;

    int contWatch;
    int contFd;  /* PTY FD in the container */
    bool contClosed;
    int contEpoll;

    int epollWatch;
    int epollFd; /* epoll FD for dealing with EOF */

    size_t fromHostLen;
    char fromHostBuf[1024];
    size_t fromContLen;
    char fromContBuf[1024];
94

95
    virNetDaemonPtr daemon;
96 97
};

98 99 100 101
typedef struct _virLXCController virLXCController;
typedef virLXCController *virLXCControllerPtr;
struct _virLXCController {
    char *name;
102
    virDomainObjPtr vm;
103
    virDomainDefPtr def;
104

105 106
    int handshakeFd;

107 108
    pid_t initpid;

109 110 111
    size_t nnbdpids;
    pid_t *nbdpids;

112 113
    size_t nveths;
    char **veths;
114

115 116 117
    size_t nnicindexes;
    int *nicindexes;

118 119 120
    size_t npassFDs;
    int *passFDs;

I
ik.nitk 已提交
121 122
    int *nsFDs;

123 124
    size_t nconsoles;
    virLXCControllerConsolePtr consoles;
125
    char *devptmx;
126 127 128

    size_t nloopDevs;
    int *loopDevFds;
129 130

    virSecurityManagerPtr securityManager;
131

132
    virNetDaemonPtr daemon;
133
    bool firstClient;
134 135 136 137
    virNetServerClientPtr client;
    virNetServerProgramPtr prog;
    bool inShutdown;
    int timerShutdown;
G
Gao feng 已提交
138

139 140
    virCgroupPtr cgroup;

G
Gao feng 已提交
141
    virLXCFusePtr fuse;
142 143
};

144 145
#include "lxc_controller_dispatch.h"

146
static void virLXCControllerFree(virLXCControllerPtr ctrl);
147 148
static int virLXCControllerEventSendInit(virLXCControllerPtr ctrl,
                                         pid_t initpid);
149

J
Ján Tomko 已提交
150
/*
 * Timeout callback that asks the controller's daemon to quit.
 *
 * @timer: id of the timeout that fired (unused)
 * @opaque: the virLXCControllerPtr supplied when the timeout was added
 */
static void virLXCControllerQuitTimer(int timer G_GNUC_UNUSED, void *opaque)
{
    virLXCControllerPtr controller = opaque;

    VIR_DEBUG("Triggering event loop quit");
    virNetDaemonQuit(controller->daemon);
}


159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
/*
 * Allocate a driver object, initialise its lock and fill in its
 * capabilities and XML option handles.
 *
 * Returns the new driver, or NULL (with an error reported) when the
 * mutex cannot be initialised.
 */
static virLXCDriverPtr
virLXCControllerDriverNew(void)
{
    virLXCDriverPtr drv = g_new0(virLXCDriver, 1);

    if (virMutexInit(&drv->lock) >= 0) {
        drv->caps = virLXCDriverCapsInit(NULL);
        drv->xmlopt = lxcDomainXMLConfInit(drv);
        return drv;
    }

    virReportError(VIR_ERR_INTERNAL_ERROR,
                   "%s", _("cannot initialize mutex"));
    g_free(drv);
    return NULL;
}


/*
 * Release a driver created by virLXCControllerDriverNew().
 * Passing NULL is a no-op.
 */
static void
virLXCControllerDriverFree(virLXCDriverPtr driver)
{
    if (driver) {
        virObjectUnref(driver->xmlopt);
        virObjectUnref(driver->caps);
        virMutexDestroy(&driver->lock);
        g_free(driver);
    }
}


190 191 192
static virLXCControllerPtr virLXCControllerNew(const char *name)
{
    virLXCControllerPtr ctrl = NULL;
193
    virLXCDriverPtr driver = NULL;
194 195
    char *configFile = NULL;

196
    if (VIR_ALLOC(ctrl) < 0)
197
        goto error;
198

199
    ctrl->timerShutdown = -1;
200
    ctrl->firstClient = true;
201

202
    ctrl->name = g_strdup(name);
203

204
    if (!(driver = virLXCControllerDriverNew()))
205 206
        goto error;

207 208 209 210
    if ((configFile = virDomainConfigFile(LXC_STATE_DIR,
                                          ctrl->name)) == NULL)
        goto error;

211
    if ((ctrl->vm = virDomainObjParseFile(configFile,
212
                                          driver->xmlopt,
213
                                          0)) == NULL)
214
        goto error;
215
    ctrl->def = ctrl->vm->def;
216

217 218 219 220 221
    if ((ctrl->timerShutdown = virEventAddTimeout(-1,
                                                  virLXCControllerQuitTimer, ctrl,
                                                  NULL)) < 0)
        goto error;

222
 cleanup:
223
    VIR_FREE(configFile);
224
    virLXCControllerDriverFree(driver);
225 226
    return ctrl;

227
 error:
228 229 230 231 232
    virLXCControllerFree(ctrl);
    ctrl = NULL;
    goto cleanup;
}

233

234
/*
 * Close every descriptor recorded in ctrl->loopDevFds.
 *
 * Always returns 0.
 */
static int virLXCControllerCloseLoopDevices(virLXCControllerPtr ctrl)
{
    size_t idx = 0;

    while (idx < ctrl->nloopDevs) {
        VIR_FORCE_CLOSE(ctrl->loopDevFds[idx]);
        idx++;
    }

    return 0;
}


245 246 247 248 249
/*
 * If an init process is recorded, close the loop device descriptors,
 * abort that process and forget its pid. Does nothing when
 * ctrl->initpid is 0.
 */
static void virLXCControllerStopInit(virLXCControllerPtr ctrl)
{
    if (ctrl->initpid != 0) {
        virLXCControllerCloseLoopDevices(ctrl);
        virProcessAbort(ctrl->initpid);
        ctrl->initpid = 0;
    }
}


256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271
/*
 * Tear down one console: for the host, container and epoll sides in
 * turn, remove the event watch (when one is registered) and close the
 * matching descriptor.
 */
static void virLXCControllerConsoleClose(virLXCControllerConsolePtr console)
{
    /* Host side */
    if (console->hostWatch != -1) {
        virEventRemoveHandle(console->hostWatch);
    }
    VIR_FORCE_CLOSE(console->hostFd);

    /* Container side */
    if (console->contWatch != -1) {
        virEventRemoveHandle(console->contWatch);
    }
    VIR_FORCE_CLOSE(console->contFd);

    /* EOF-detection epoll descriptor */
    if (console->epollWatch != -1) {
        virEventRemoveHandle(console->epollWatch);
    }
    VIR_FORCE_CLOSE(console->epollFd);
}


G
Gao feng 已提交
272 273 274 275 276 277 278
/*
 * Release the controller's FUSE state by handing the address of
 * ctrl->fuse to lxcFreeFuse().
 */
static void
virLXCControllerFreeFuse(virLXCControllerPtr ctrl)
{
    /* A void function must not "return <expression>;" in standard C,
     * so the helper is simply called. */
    lxcFreeFuse(&ctrl->fuse);
}


279 280
/*
 * Release every resource owned by @ctrl and then @ctrl itself.
 * Passing NULL is a no-op.
 *
 * Any recorded init process is stopped first; the handshake
 * descriptor is deliberately closed last.
 */
static void virLXCControllerFree(virLXCControllerPtr ctrl)
{
    size_t i;

    if (!ctrl)
        return;

    virLXCControllerStopInit(ctrl);

    /* virLXCControllerStopInit() only closes the loop device
     * descriptors when an init process was recorded, and nothing else
     * releases the array. Close whatever is left (the same macro is
     * already applied twice to handshakeFd elsewhere in this file, so
     * repeating it is presumed harmless) and free the storage. */
    virLXCControllerCloseLoopDevices(ctrl);
    VIR_FREE(ctrl->loopDevFds);

    virObjectUnref(ctrl->securityManager);

    for (i = 0; i < ctrl->nveths; i++)
        VIR_FREE(ctrl->veths[i]);
    VIR_FREE(ctrl->veths);
    VIR_FREE(ctrl->nicindexes);

    for (i = 0; i < ctrl->npassFDs; i++)
        VIR_FORCE_CLOSE(ctrl->passFDs[i]);
    VIR_FREE(ctrl->passFDs);

    for (i = 0; i < ctrl->nconsoles; i++)
        virLXCControllerConsoleClose(&(ctrl->consoles[i]));
    VIR_FREE(ctrl->consoles);

    VIR_FREE(ctrl->devptmx);

    virDomainObjEndAPI(&ctrl->vm);
    VIR_FREE(ctrl->name);

    if (ctrl->timerShutdown != -1)
        virEventRemoveTimeout(ctrl->timerShutdown);

    virObjectUnref(ctrl->daemon);
    virLXCControllerFreeFuse(ctrl);

    VIR_FREE(ctrl->nbdpids);

    VIR_FREE(ctrl->nsFDs);
    virCgroupFree(&ctrl->cgroup);

    /* This must always be the last thing to be closed */
    VIR_FORCE_CLOSE(ctrl->handshakeFd);
    VIR_FREE(ctrl);
}

324

325 326 327
/*
 * Append a console entry for the host PTY descriptor @hostFd.
 *
 * The new entry takes the controller's daemon pointer; its container
 * and epoll descriptors and all three watch ids start out as -1.
 *
 * Returns 0 on success, -1 if the console array cannot be grown.
 */
static int virLXCControllerAddConsole(virLXCControllerPtr ctrl,
                                      int hostFd)
{
    virLXCControllerConsolePtr slot;

    if (VIR_EXPAND_N(ctrl->consoles, ctrl->nconsoles, 1) < 0)
        return -1;

    slot = &ctrl->consoles[ctrl->nconsoles - 1];

    slot->daemon = ctrl->daemon;
    slot->hostFd = hostFd;
    slot->hostWatch = -1;

    slot->contFd = -1;
    slot->contWatch = -1;

    slot->epollFd = -1;
    slot->epollWatch = -1;

    return 0;
}


/*
 * Switch both PTY descriptors of @console to non-blocking mode.
 *
 * Returns 0 on success, -1 (with an error reported) if either
 * descriptor cannot be changed.
 */
static int virLXCControllerConsoleSetNonblocking(virLXCControllerConsolePtr console)
{
    if (virSetBlocking(console->hostFd, false) >= 0 &&
        virSetBlocking(console->contFd, false) >= 0)
        return 0;

    virReportSystemError(errno, "%s",
                         _("Unable to set console file descriptor non-blocking"));
    return -1;
}


356 357 358 359 360 361 362 363 364 365 366 367
/*
 * Send the "continue" message over the handshake descriptor and close
 * that descriptor once the message has been sent.
 *
 * Returns 0 on success, -1 (with an error reported) if sending fails;
 * in that case the descriptor is left open.
 */
static int virLXCControllerDaemonHandshake(virLXCControllerPtr ctrl)
{
    if (lxcContainerSendContinue(ctrl->handshakeFd) >= 0) {
        VIR_FORCE_CLOSE(ctrl->handshakeFd);
        return 0;
    }

    virReportSystemError(errno, "%s",
                         _("error sending continue signal to daemon"));
    return -1;
}


368 369 370
/*
 * Check that the number of veths handed to the controller equals the
 * number of network interfaces in the domain definition.
 *
 * Returns 0 when they match, -1 (with an error reported) otherwise.
 */
static int virLXCControllerValidateNICs(virLXCControllerPtr ctrl)
{
    if (ctrl->def->nnets == ctrl->nveths)
        return 0;

    virReportError(VIR_ERR_INTERNAL_ERROR,
                   _("expecting %zu veths, but got %zu"),
                   ctrl->def->nnets, ctrl->nveths);
    return -1;
}


381 382 383 384
/*
 * Collect interface indexes into ctrl->nicindexes.
 *
 * Returns 0 on success, -1 on failure (index lookup failure, array
 * growth failure, or an unsupported or out-of-range interface type).
 */
static int virLXCControllerGetNICIndexes(virLXCControllerPtr ctrl)
{
    size_t i;

    /* Gather the ifindexes of the "parent" veths for all interfaces
     * implemented with a veth pair. These will be used when calling
     * virCgroupNewMachine (and eventually the dbus method
     * CreateMachineWithNetwork). ifindexes for the child veths, and
     * for macvlan interfaces, *should not* be in this list, as they
     * will be moved into the container. Only the interfaces that will
     * remain outside the container, but are used for communication
     * with the container, should be added to the list.
     */

    VIR_DEBUG("Getting nic indexes");
    for (i = 0; i < ctrl->def->nnets; i++) {
        const char *ifname = ctrl->def->nets[i]->ifname;
        virDomainNetType actualType = virDomainNetGetActualType(ctrl->def->nets[i]);
        int nicindex = -1;

        switch (actualType) {
        case VIR_DOMAIN_NET_TYPE_BRIDGE:
        case VIR_DOMAIN_NET_TYPE_NETWORK:
        case VIR_DOMAIN_NET_TYPE_ETHERNET:
            /* Interfaces without a name contribute no index */
            if (!ifname)
                continue;

            if (virNetDevGetIndex(ifname, &nicindex) < 0 ||
                VIR_EXPAND_N(ctrl->nicindexes, ctrl->nnicindexes, 1) < 0)
                return -1;

            VIR_DEBUG("Index %d for %s", nicindex,
                      ifname);
            ctrl->nicindexes[ctrl->nnicindexes - 1] = nicindex;
            break;

        case VIR_DOMAIN_NET_TYPE_DIRECT:
            /* Accepted, but nothing is recorded */
            break;

        case VIR_DOMAIN_NET_TYPE_USER:
        case VIR_DOMAIN_NET_TYPE_VHOSTUSER:
        case VIR_DOMAIN_NET_TYPE_SERVER:
        case VIR_DOMAIN_NET_TYPE_CLIENT:
        case VIR_DOMAIN_NET_TYPE_MCAST:
        case VIR_DOMAIN_NET_TYPE_UDP:
        case VIR_DOMAIN_NET_TYPE_INTERNAL:
        case VIR_DOMAIN_NET_TYPE_HOSTDEV:
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("Unsupported net type %s"),
                           virDomainNetTypeToString(actualType));
            return -1;

        case VIR_DOMAIN_NET_TYPE_LAST:
        default:
            virReportEnumRangeError(virDomainNetType, actualType);
            return -1;
        }
    }

    return 0;
}


444 445 446
/*
 * Check that the number of console descriptors handed to the
 * controller equals the number of consoles in the domain definition.
 *
 * Returns 0 when they match, -1 (with an error reported) otherwise.
 */
static int virLXCControllerValidateConsoles(virLXCControllerPtr ctrl)
{
    if (ctrl->def->nconsoles == ctrl->nconsoles)
        return 0;

    virReportError(VIR_ERR_INTERNAL_ERROR,
                   _("expecting %zu consoles, but got %zu tty file handlers"),
                   ctrl->def->nconsoles, ctrl->nconsoles);
    return -1;
}


457
/*
 * Associate the file backing @fs with a loop device and rewrite @fs
 * so that it refers to that device as a block filesystem.
 *
 * Returns the descriptor obtained from virFileLoopDeviceAssociate()
 * on success, -1 on failure.
 */
static int virLXCControllerSetupLoopDeviceFS(virDomainFSDefPtr fs)
{
    char *loopPath = NULL;
    int loopFd = virFileLoopDeviceAssociate(fs->src->path, &loopPath);

    if (loopFd < 0)
        return -1;

    VIR_DEBUG("Changing fs %s to use type=block for dev %s",
              fs->src->path, loopPath);

    /*
     * From here on the filesystem is a block device, so the remaining
     * container setup needs no special casing. The device path replaces
     * the old source path and is now owned by @fs.
     */
    fs->type = VIR_DOMAIN_FS_TYPE_BLOCK;
    VIR_FREE(fs->src->path);
    fs->src->path = loopPath;

    return loopFd;
}


480 481 482 483
/*
 * Associate the file backing @disk with a loop device and rewrite
 * @disk so that it refers to that device as a block disk.
 *
 * Returns the loop device descriptor on success. If the new source
 * cannot be stored the descriptor is closed and the (then closed)
 * descriptor variable is returned; callers treat a negative value as
 * failure.
 */
static int virLXCControllerSetupLoopDeviceDisk(virDomainDiskDefPtr disk)
{
    const char *src = virDomainDiskGetSource(disk);
    char *loopPath = NULL;
    int loopFd = virFileLoopDeviceAssociate(src, &loopPath);
    bool failed;

    if (loopFd < 0)
        return -1;

    VIR_DEBUG("Changing disk %s to use type=block for dev %s",
              src, loopPath);

    /*
     * From here on the disk is a block device, so the remaining
     * container setup needs no special casing.
     */
    virDomainDiskSetType(disk, VIR_STORAGE_TYPE_BLOCK);
    failed = virDomainDiskSetSource(disk, loopPath) < 0;

    VIR_FREE(loopPath);
    /* NOTE(review): relies on VIR_FORCE_CLOSE resetting its argument
     * to -1 so that the failure reaches the caller - confirm in
     * virfile.h */
    if (failed)
        VIR_FORCE_CLOSE(loopFd);

    return loopFd;
}


513 514 515 516 517 518 519 520 521 522
/*
 * Attach the image backing @fs to an NBD device and rewrite @fs so
 * that it refers to that device as a block filesystem.
 *
 * An explicit image format is required.
 *
 * Returns 0 on success, -1 on failure.
 */
static int virLXCControllerSetupNBDDeviceFS(virDomainFSDefPtr fs)
{
    char *nbdDev;

    if (fs->format <= VIR_STORAGE_FILE_NONE) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("An explicit disk format must be specified"));
        return -1;
    }

    if (virFileNBDDeviceAssociate(fs->src->path, fs->format,
                                  fs->readonly, &nbdDev) < 0)
        return -1;

    VIR_DEBUG("Changing fs %s to use type=block for dev %s",
              fs->src->path, nbdDev);

    /*
     * From here on the filesystem is a block device, so the remaining
     * container setup needs no special casing. The device path replaces
     * the old source path and is now owned by @fs.
     */
    fs->type = VIR_DOMAIN_FS_TYPE_BLOCK;
    VIR_FREE(fs->src->path);
    fs->src->path = nbdDev;

    return 0;
}


/*
 * Attach the image backing @disk to an NBD device and rewrite @disk
 * so that it refers to that device as a block disk.
 *
 * An explicit image format is required.
 *
 * Returns 0 on success, -1 on failure.
 */
static int virLXCControllerSetupNBDDeviceDisk(virDomainDiskDefPtr disk)
{
    char *nbdDev;
    const char *src = virDomainDiskGetSource(disk);
    int format = virDomainDiskGetFormat(disk);
    int rc;

    if (format <= VIR_STORAGE_FILE_NONE) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("An explicit disk format must be specified"));
        return -1;
    }

    if (virFileNBDDeviceAssociate(src, format,
                                  disk->src->readonly, &nbdDev) < 0)
        return -1;

    VIR_DEBUG("Changing disk %s to use type=block for dev %s",
              src, nbdDev);

    /*
     * From here on the disk is a block device, so the remaining
     * container setup needs no special casing.
     */
    virDomainDiskSetType(disk, VIR_STORAGE_TYPE_BLOCK);
    rc = virDomainDiskSetSource(disk, nbdDev);

    /* The local copy of the device path is released on both outcomes */
    VIR_FREE(nbdDev);

    return rc < 0 ? -1 : 0;
}

577 578 579 580
/*
 * Record the process ids serving the NBD device @dev in
 * ctrl->nbdpids.
 *
 * @ctrl: the controller state
 * @dev: device path, which must start with "/dev/"
 *
 * The pid is read from /sys/devices/virtual/block/<name>/pid, waiting
 * up to ten times 100ms for that file to appear, and is then expanded
 * with virProcessGetPids().
 *
 * Returns 0 on success, -1 on failure.
 */
static int virLXCControllerAppendNBDPids(virLXCControllerPtr ctrl,
                                         const char *dev)
{
    char *pidpath = NULL;
    pid_t *pids = NULL;
    size_t npids = 0;
    size_t i;
    int ret = -1;
    size_t loops = 0;
    pid_t pid;

    if (!STRPREFIX(dev, "/dev/")) {
        /* Callers treat -1 as a reported failure, so say why */
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("NBD device path '%s' does not start with /dev/"),
                       dev);
        goto cleanup;
    }

    /* dev + 5 skips the "/dev/" prefix checked above */
    pidpath = g_strdup_printf("/sys/devices/virtual/block/%s/pid", dev + 5);

    /* Wait for the pid file to appear */
    while (!virFileExists(pidpath)) {
        /* wait for 100ms before checking again, but don't do it for ever */
        if (errno == ENOENT && loops < 10) {
            g_usleep(100 * 1000);
            loops++;
        } else {
            virReportSystemError(errno,
                                 _("Cannot check NBD device %s pid"),
                                 dev + 5);
            goto cleanup;
        }
    }

    if (virPidFileReadPath(pidpath, &pid) < 0)
        goto cleanup;

    if (virProcessGetPids(pid, &npids, &pids) < 0)
        goto cleanup;

    for (i = 0; i < npids; i++) {
        if (VIR_APPEND_ELEMENT(ctrl->nbdpids, ctrl->nnbdpids, pids[i]) < 0)
            goto cleanup;
    }

    ret = 0;

 cleanup:
    VIR_FREE(pids);
    VIR_FREE(pidpath);
    return ret;
}
625

626
/*
 * Turn every file-backed filesystem and disk of the domain into a
 * block device, using either a loop device or an NBD device.
 *
 * Loop device descriptors are stored in ctrl->loopDevFds; the pids
 * behind NBD devices are stored through
 * virLXCControllerAppendNBDPids().
 *
 * Returns 0 on success, -1 on the first failure.
 */
static int virLXCControllerSetupLoopDevices(virLXCControllerPtr ctrl)
{
    size_t i;

    VIR_DEBUG("Setting up loop devices for filesystems");

    for (i = 0; i < ctrl->def->nfss; i++) {
        virDomainFSDefPtr fs = ctrl->def->fss[i];
        bool rawOrNone;
        int fd;

        if (fs->type != VIR_DOMAIN_FS_TYPE_FILE)
            continue;

        rawOrNone = (fs->format == VIR_STORAGE_FILE_RAW ||
                     fs->format == VIR_STORAGE_FILE_NONE);

        /* Without an explicit driver, raw or unspecified formats use
         * 'loop' and everything else uses 'nbd' */
        if (fs->fsdriver == VIR_DOMAIN_FS_DRIVER_TYPE_DEFAULT) {
            if (rawOrNone)
                fs->fsdriver = VIR_DOMAIN_FS_DRIVER_TYPE_LOOP;
            else
                fs->fsdriver = VIR_DOMAIN_FS_DRIVER_TYPE_NBD;
        }

        if (fs->fsdriver == VIR_DOMAIN_FS_DRIVER_TYPE_LOOP) {
            if (!rawOrNone) {
                virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                               _("fs format %s is not supported"),
                               virStorageFileFormatTypeToString(fs->format));
                return -1;
            }

            if ((fd = virLXCControllerSetupLoopDeviceFS(fs)) < 0)
                return -1;

            VIR_DEBUG("Saving loop fd %d", fd);
            if (VIR_EXPAND_N(ctrl->loopDevFds, ctrl->nloopDevs, 1) < 0) {
                VIR_FORCE_CLOSE(fd);
                return -1;
            }
            ctrl->loopDevFds[ctrl->nloopDevs - 1] = fd;
        } else if (fs->fsdriver == VIR_DOMAIN_FS_DRIVER_TYPE_NBD) {
            if (virLXCControllerSetupNBDDeviceFS(fs) < 0)
                return -1;

            /* The NBD device will be cleaned up while the cgroup will end.
             * For this we need to remember the qemu-nbd pid and add it to
             * the cgroup */
            if (virLXCControllerAppendNBDPids(ctrl, fs->src->path) < 0)
                return -1;
        } else {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("fs driver %s is not supported"),
                           virDomainFSDriverTypeToString(fs->fsdriver));
            return -1;
        }
    }

    VIR_DEBUG("Setting up loop devices for disks");

    for (i = 0; i < ctrl->def->ndisks; i++) {
        virDomainDiskDefPtr disk = ctrl->def->disks[i];
        const char *driver = virDomainDiskGetDriver(disk);
        int format = virDomainDiskGetFormat(disk);
        bool rawOrNone = (format == VIR_STORAGE_FILE_RAW ||
                          format == VIR_STORAGE_FILE_NONE);
        int fd;

        if (virDomainDiskGetType(disk) != VIR_STORAGE_TYPE_FILE)
            continue;

        /* If no driverName is set, we prefer 'loop' for
         * dealing with raw or undefined formats, otherwise
         * we use 'nbd'.
         */
        if (STREQ_NULLABLE(driver, "loop") || (!driver && rawOrNone)) {
            if (!rawOrNone) {
                virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                               _("disk format %s is not supported"),
                               virStorageFileFormatTypeToString(format));
                return -1;
            }

            /* We treat 'none' as meaning 'raw' since we
             * don't want to go into the auto-probing
             * business for security reasons
             */
            if ((fd = virLXCControllerSetupLoopDeviceDisk(disk)) < 0)
                return -1;

            VIR_DEBUG("Saving loop fd %d", fd);
            if (VIR_EXPAND_N(ctrl->loopDevFds, ctrl->nloopDevs, 1) < 0) {
                VIR_FORCE_CLOSE(fd);
                return -1;
            }
            ctrl->loopDevFds[ctrl->nloopDevs - 1] = fd;
        } else if (!driver || STREQ(driver, "nbd")) {
            /* Only the default and disabled cache modes are accepted */
            if (disk->cachemode != VIR_DOMAIN_DISK_CACHE_DEFAULT &&
                disk->cachemode != VIR_DOMAIN_DISK_CACHE_DISABLE) {
                virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                               _("Disk cache mode %s is not supported"),
                               virDomainDiskCacheTypeToString(disk->cachemode));
                return -1;
            }

            if (virLXCControllerSetupNBDDeviceDisk(disk) < 0)
                return -1;

            /* The NBD device will be cleaned up while the cgroup will end.
             * For this we need to remember the qemu-nbd pid and add it to
             * the cgroup */
            if (virLXCControllerAppendNBDPids(ctrl, virDomainDiskGetSource(disk)) < 0)
                return -1;
        } else {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("disk driver %s is not supported"),
                           driver);
            return -1;
        }
    }

    VIR_DEBUG("Setup all loop devices");

    return 0;
}

753 754 755 756

/*
 * To be run while still single threaded
 */
757
static int virLXCControllerSetupCpuAffinity(virLXCControllerPtr ctrl)
758
{
H
Hu Tao 已提交
759 760
    int hostcpus, maxcpu = CPU_SETSIZE;
    virBitmapPtr cpumap, cpumapToSet;
761 762 763 764 765

    VIR_DEBUG("Setting CPU affinity");

    /* setaffinity fails if you set bits for CPUs which
     * aren't present, so we have to limit ourselves */
766
    if ((hostcpus = virHostCPUGetCount()) < 0)
767 768
        return -1;

769 770 771
    if (maxcpu > hostcpus)
        maxcpu = hostcpus;

772 773
    cpumap = virBitmapNew(maxcpu);
    if (!cpumap)
774 775
        return -1;

H
Hu Tao 已提交
776 777
    cpumapToSet = cpumap;

778
    if (ctrl->def->cpumask) {
H
Hu Tao 已提交
779
        cpumapToSet = ctrl->def->cpumask;
780 781 782 783 784
    } else {
        /* You may think this is redundant, but we can't assume libvirtd
         * itself is running on all pCPUs, so we need to explicitly set
         * the spawned LXC instance to all pCPUs if no map is given in
         * its config file */
785
        virBitmapSetAll(cpumap);
786 787
    }

788
    /* We are presuming we are running between fork/exec of LXC
789 790 791
     * so use '0' to indicate our own process ID. No threads are
     * running at this point
     */
792
    if (virProcessSetAffinity(0 /* Self */, cpumapToSet) < 0) {
793
        virBitmapFree(cpumap);
794 795
        return -1;
    }
796
    virBitmapFree(cpumap);
797 798 799 800 801

    return 0;
}


802 803 804 805
/*
 * Obtain the advisory NUMA nodeset for the domain when its
 * definition needs placement advice.
 *
 * @ctrl: the controller state
 * @mask: set to the parsed nodeset, or to NULL when no advice is
 *        needed; left untouched on failure
 *
 * Returns 0 on success, -1 on failure.
 */
static int virLXCControllerGetNumadAdvice(virLXCControllerPtr ctrl,
                                          virBitmapPtr *mask)
{
    virBitmapPtr parsed = NULL;
    g_autofree char *nodeset = NULL;

    /* Advice is only fetched if 'placement' of either <vcpu> or
     * <numatune> is 'auto'.
     */
    if (!virDomainDefNeedsPlacementAdvice(ctrl->def)) {
        *mask = NULL;
        return 0;
    }

    nodeset = virNumaGetAutoPlacementAdvice(virDomainDefGetVcpus(ctrl->def),
                                            ctrl->def->mem.cur_balloon);
    if (!nodeset)
        return -1;

    VIR_DEBUG("Nodeset returned from numad: %s", nodeset);

    if (virBitmapParse(nodeset, &parsed, VIR_DOMAIN_CPUMASK_LEN) < 0)
        return -1;

    *mask = parsed;
    return 0;
}


829
/**
830 831
 * virLXCControllerSetupResourceLimits
 * @ctrl: the controller state
832
 *
833 834 835
 * Sets up the non-cgroup based resource limits that need
 * to be inherited by the child process across clone()/exec().
 * The cgroup limits are setup later
836 837 838
 *
 * Returns 0 on success or -1 in case of error
 */
839
static int virLXCControllerSetupResourceLimits(virLXCControllerPtr ctrl)
840
{
841
    virBitmapPtr auto_nodeset = NULL;
842
    int ret = -1;
843 844 845
    virBitmapPtr nodeset = NULL;
    virDomainNumatuneMemMode mode;

846 847 848 849 850 851 852 853
    if (virDomainNumatuneGetMode(ctrl->def->numa, -1, &mode) == 0) {
        if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
            virCgroupControllerAvailable(VIR_CGROUP_CONTROLLER_CPUSET)) {
            /* Use virNuma* API iff necessary. Once set and child is exec()-ed,
             * there's no way for us to change it. Rely on cgroups (if available
             * and enabled in the config) rather than virNuma*. */
            VIR_DEBUG("Relying on CGroups for memory binding");
        } else {
854

855
            VIR_DEBUG("Setting up process resource limits");
856

857 858
            if (virLXCControllerGetNumadAdvice(ctrl, &auto_nodeset) < 0)
                goto cleanup;
859

860
            nodeset = virDomainNumatuneGetNodeset(ctrl->def->numa, auto_nodeset, -1);
861

862 863 864
            if (virNumaSetupMemoryPolicy(mode, nodeset) < 0)
                goto cleanup;
        }
865
    }
866

867
    if (virLXCControllerSetupCpuAffinity(ctrl) < 0)
868
        goto cleanup;
869

870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885
    ret = 0;
 cleanup:
    virBitmapFree(auto_nodeset);
    return ret;
}


/*
 * Creates the cgroup for the container, places this process and every
 * recorded qemu-nbd process into it, and applies the limits from the
 * domain definition.
 *
 * Returns 0 on success, -1 on failure.
 */
static int virLXCControllerSetupCgroupLimits(virLXCControllerPtr ctrl)
{
    virBitmapPtr auto_nodeset = NULL;
    virBitmapPtr nodeset = NULL;
    size_t idx = 0;
    int ret = -1;

    VIR_DEBUG("Setting up cgroup resource limits");

    if (virLXCControllerGetNumadAdvice(ctrl, &auto_nodeset) < 0)
        goto cleanup;

    nodeset = virDomainNumatuneGetNodeset(ctrl->def->numa, auto_nodeset, -1);

    ctrl->cgroup = virLXCCgroupCreate(ctrl->def,
                                      ctrl->initpid,
                                      ctrl->nnicindexes,
                                      ctrl->nicindexes);
    if (!ctrl->cgroup)
        goto cleanup;

    if (virCgroupAddMachineProcess(ctrl->cgroup, getpid()) < 0)
        goto cleanup;

    /* Add all qemu-nbd tasks to the cgroup */
    while (idx < ctrl->nnbdpids) {
        if (virCgroupAddMachineProcess(ctrl->cgroup, ctrl->nbdpids[idx]) < 0)
            goto cleanup;
        idx++;
    }

    if (virLXCCgroupSetup(ctrl->def, ctrl->cgroup, nodeset) < 0)
        goto cleanup;

    ret = 0;

 cleanup:
    virBitmapFree(auto_nodeset);
    return ret;
}

919

920 921 922 923 924 925 926 927 928 929 930 931 932
/*
 * Close hook for monitor clients: forget @client if it is the one the
 * controller currently tracks and, when a shutdown is in progress,
 * set the shutdown timeout to 0.
 */
static void virLXCControllerClientCloseHook(virNetServerClientPtr client)
{
    virLXCControllerPtr controller = virNetServerClientGetPrivateData(client);

    VIR_DEBUG("Client %p has closed", client);

    if (controller->client == client) {
        controller->client = NULL;
    }

    if (!controller->inShutdown)
        return;

    VIR_DEBUG("Arm timer to quit event loop");
    virEventUpdateTimeout(controller->timerShutdown, 0);
}

933 934
static void virLXCControllerClientPrivateFree(void *data)
{
935 936
    virLXCControllerPtr ctrl = data;
    VIR_DEBUG("Got private data free %p", ctrl);
937 938 939 940
}

/*
 * Private-data constructor for monitor clients.
 *
 * Installs the close hook on @client, records it as the controller's
 * current client and, for the first client only, sends the init event
 * if an init pid is already known.
 *
 * @client: the newly connected client
 * @opaque: the virLXCControllerPtr
 *
 * Returns the controller, which serves as the client's private data.
 */
static void *virLXCControllerClientPrivateNew(virNetServerClientPtr client,
                                              void *opaque)
{
    virLXCControllerPtr controller = opaque;

    virNetServerClientSetCloseHook(client, virLXCControllerClientCloseHook);
    VIR_DEBUG("Got new client %p", client);
    controller->client = client;

    if (controller->initpid) {
        if (controller->firstClient)
            virLXCControllerEventSendInit(controller, controller->initpid);
    }
    controller->firstClient = false;

    return controller;
}

955 956

/**
 * virLXCControllerSetupServer
 * @ctrl: controller to attach the monitor server to
 *
 * Creates the "LXC" RPC server, a UNIX socket service at
 * LXC_STATE_DIR/<name>.sock (created while the security manager's socket
 * label is in effect), the monitor RPC program and the daemon object,
 * and wires them together.
 *
 * Returns 0 on success or -1 in case of error; on error ctrl->daemon is
 * released and reset to NULL.
 */
static int virLXCControllerSetupServer(virLXCControllerPtr ctrl)
{
    g_autofree char *sockpath = g_strdup_printf("%s/%s.sock",
                                                LXC_STATE_DIR, ctrl->name);
    virNetServerPtr server = NULL;
    virNetServerServicePtr service = NULL;

    server = virNetServerNew("LXC", 1,
                             0, 0, 0, 1,
                             0, -1, 0,
                             virLXCControllerClientPrivateNew,
                             NULL,
                             virLXCControllerClientPrivateFree,
                             ctrl);
    if (!server)
        goto error;

    /* The socket label is set only around creation of the listening socket */
    if (virSecurityManagerSetSocketLabel(ctrl->securityManager, ctrl->def) < 0)
        goto error;

    service = virNetServerServiceNewUNIX(sockpath,
                                         0700,
                                         0,
                                         0,
                                         NULL,
                                         false,
                                         0,
                                         5);
    if (!service)
        goto error;

    if (virSecurityManagerClearSocketLabel(ctrl->securityManager, ctrl->def) < 0)
        goto error;

    if (virNetServerAddService(server, service) < 0)
        goto error;

    /* Our local reference to the service is no longer needed */
    virObjectUnref(service);
    service = NULL;

    ctrl->prog = virNetServerProgramNew(VIR_LXC_MONITOR_PROGRAM,
                                        VIR_LXC_MONITOR_PROGRAM_VERSION,
                                        virLXCMonitorProcs,
                                        virLXCMonitorNProcs);
    if (!ctrl->prog)
        goto error;

    ctrl->daemon = virNetDaemonNew();
    if (!ctrl->daemon)
        goto error;

    if (virNetDaemonAddServer(ctrl->daemon, server) < 0)
        goto error;

    virNetDaemonUpdateServices(ctrl->daemon, true);
    return 0;

 error:
    virObjectUnref(server);
    virObjectUnref(ctrl->daemon);
    ctrl->daemon = NULL;
    virObjectUnref(service);
    return -1;
}
1016

D
Daniel P. Berrange 已提交
1017 1018 1019

/**
 * lxcControllerClearCapabilities
 *
 * When built with libcap-ng, clears the capability sets selected by
 * CAPNG_SELECT_BOTH and applies the result to this process. Without
 * libcap-ng only a warning is logged.
 *
 * Returns 0 on success (or when libcap-ng is unavailable), -1 if the
 * cleared capabilities could not be applied
 */
static int lxcControllerClearCapabilities(void)
{
#if WITH_CAPNG
    int rc;

    capng_clear(CAPNG_SELECT_BOTH);

    rc = capng_apply(CAPNG_SELECT_BOTH);
    if (rc < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("failed to apply capabilities: %d"), rc);
        return -1;
    }
#else
    VIR_WARN("libcap-ng support not compiled in, unable to clear capabilities");
#endif
    return 0;
}

1036
static bool wantReboot;
1037
static virMutex lock = VIR_MUTEX_INITIALIZER;
1038 1039


M
Martin Kletzander 已提交
1040
static void virLXCControllerSignalChildIO(virNetDaemonPtr dmn,
J
Ján Tomko 已提交
1041
                                          siginfo_t *info G_GNUC_UNUSED,
1042
                                          void *opaque)
1043
{
1044
    virLXCControllerPtr ctrl = opaque;
1045
    int ret;
1046
    int status;
1047

1048
    ret = waitpid(-1, &status, WNOHANG);
1049
    VIR_DEBUG("Got sig child %d vs %lld", ret, (long long)ctrl->initpid);
1050
    if (ret == ctrl->initpid) {
M
Martin Kletzander 已提交
1051
        virNetDaemonQuit(dmn);
1052
        virMutexLock(&lock);
1053
        if (WIFSIGNALED(status) &&
1054 1055
            WTERMSIG(status) == SIGHUP) {
            VIR_DEBUG("Status indicates reboot");
1056
            wantReboot = true;
1057
        }
1058 1059
        virMutexUnlock(&lock);
    }
1060 1061 1062
}


1063
/**
 * virLXCControllerConsoleUpdateWatch
 * @console: console whose watches are to be refreshed
 *
 * Recomputes the event-loop masks and epoll registrations of both ends
 * of a console (host PTY and container PTY) from the current buffer
 * fill levels and closed flags:
 *
 *  - an open end listens for READABLE while its inbound buffer has room
 *    and for WRITABLE while the opposite buffer holds data for it;
 *  - an end marked closed gets an empty event-loop mask and is instead
 *    registered edge-triggered on console->epollFd so that it is noticed
 *    when the end becomes usable again;
 *  - once an end is open again its epoll registration is removed.
 *
 * Any epoll_ctl() failure is reported and the daemon is asked to quit.
 */
static void virLXCControllerConsoleUpdateWatch(virLXCControllerConsolePtr console)
{
    int hostEvents = 0;
    int contEvents = 0;

    /* If host console is open, then we can look to read/write */
    if (!console->hostClosed) {
        if (console->fromHostLen < sizeof(console->fromHostBuf))
            hostEvents |= VIR_EVENT_HANDLE_READABLE;
        if (console->fromContLen)
            hostEvents |= VIR_EVENT_HANDLE_WRITABLE;
    }

    /* If cont console is open, then we can look to read/write */
    if (!console->contClosed) {
        if (console->fromContLen < sizeof(console->fromContBuf))
            contEvents |= VIR_EVENT_HANDLE_READABLE;
        if (console->fromHostLen)
            contEvents |= VIR_EVENT_HANDLE_WRITABLE;
    }

    VIR_DEBUG("Container watch=%d, events=%d closed=%d; host watch=%d events=%d closed=%d",
              console->contWatch, contEvents, console->contClosed,
              console->hostWatch, hostEvents, console->hostClosed);
    virEventUpdateHandle(console->contWatch, contEvents);
    virEventUpdateHandle(console->hostWatch, hostEvents);

    if (console->hostClosed) {
        /* Must setup an epoll to detect when host becomes accessible again */
        int events = EPOLLIN | EPOLLET;
        if (console->fromContLen)
            events |= EPOLLOUT;

        if (events != console->hostEpoll) {
            struct epoll_event event;
            /* A non-zero saved mask means the fd is already registered */
            int action = console->hostEpoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;

            VIR_DEBUG("newHostEvents=%x oldHostEvents=%x", events, console->hostEpoll);

            event.events = events;
            event.data.fd = console->hostFd;
            if (epoll_ctl(console->epollFd, action, console->hostFd, &event) < 0) {
                /* Report first so that errno still belongs to epoll_ctl() */
                virReportSystemError(errno, "%s",
                                     _("Unable to add epoll fd"));
                VIR_DEBUG(":fail");
                virNetDaemonQuit(console->daemon);
                return;
            }
            console->hostEpoll = events;
        }
    } else if (console->hostEpoll) {
        VIR_DEBUG("Stop epoll oldHostEvents=%x", console->hostEpoll);
        if (epoll_ctl(console->epollFd, EPOLL_CTL_DEL, console->hostFd, NULL) < 0) {
            virReportSystemError(errno, "%s",
                                 _("Unable to remove epoll fd"));
            VIR_DEBUG(":fail");
            virNetDaemonQuit(console->daemon);
            return;
        }
        console->hostEpoll = 0;
    }

    if (console->contClosed) {
        /* Must setup an epoll to detect when guest becomes accessible again */
        int events = EPOLLIN | EPOLLET;
        if (console->fromHostLen)
            events |= EPOLLOUT;

        if (events != console->contEpoll) {
            struct epoll_event event;
            /* A non-zero saved mask means the fd is already registered */
            int action = console->contEpoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;

            VIR_DEBUG("newContEvents=%x oldContEvents=%x", events, console->contEpoll);

            event.events = events;
            event.data.fd = console->contFd;
            if (epoll_ctl(console->epollFd, action, console->contFd, &event) < 0) {
                virReportSystemError(errno, "%s",
                                     _("Unable to add epoll fd"));
                VIR_DEBUG(":fail");
                virNetDaemonQuit(console->daemon);
                return;
            }
            console->contEpoll = events;
        }
    } else if (console->contEpoll) {
        VIR_DEBUG("Stop epoll oldContEvents=%x", console->contEpoll);
        if (epoll_ctl(console->epollFd, EPOLL_CTL_DEL, console->contFd, NULL) < 0) {
            virReportSystemError(errno, "%s",
                                 _("Unable to remove epoll fd"));
            VIR_DEBUG(":fail");
            virNetDaemonQuit(console->daemon);
            return;
        }
        console->contEpoll = 0;
    }
}
1166 1167


1168
static void virLXCControllerConsoleEPoll(int watch, int fd, int events, void *opaque)
1169
{
1170
    virLXCControllerConsolePtr console = opaque;
1171

1172 1173 1174 1175 1176 1177 1178 1179 1180 1181
    virMutexLock(&lock);
    VIR_DEBUG("IO event watch=%d fd=%d events=%d fromHost=%zu fromcont=%zu",
              watch, fd, events,
              console->fromHostLen,
              console->fromContLen);

    while (1) {
        struct epoll_event event;
        int ret;
        ret = epoll_wait(console->epollFd, &event, 1, 0);
1182
        if (ret < 0) {
S
Stefan Berger 已提交
1183
            if (errno == EINTR)
1184 1185 1186
                continue;
            virReportSystemError(errno, "%s",
                                 _("Unable to wait on epoll"));
1187
            virNetDaemonQuit(console->daemon);
1188 1189 1190
            goto cleanup;
        }

1191 1192 1193 1194 1195 1196 1197
        if (ret == 0)
            break;

        VIR_DEBUG("fd=%d hostFd=%d contFd=%d hostEpoll=%x contEpoll=%x",
                  event.data.fd, console->hostFd, console->contFd,
                  console->hostEpoll, console->contEpoll);

1198 1199
        /* If we get HUP+dead PID, we just re-enable the main loop
         * which will see the PID has died and exit */
1200
        if ((event.events & (EPOLLIN|EPOLLOUT))) {
1201 1202
            if (event.data.fd == console->hostFd) {
                console->hostClosed = false;
1203
            } else {
1204
                console->contClosed = false;
1205
            }
1206
            virLXCControllerConsoleUpdateWatch(console);
1207 1208 1209 1210
            break;
        }
    }

1211
 cleanup:
1212
    virMutexUnlock(&lock);
1213 1214
}

1215
static void virLXCControllerConsoleIO(int watch, int fd, int events, void *opaque)
1216
{
1217
    virLXCControllerConsolePtr console = opaque;
1218 1219

    virMutexLock(&lock);
1220 1221 1222 1223
    VIR_DEBUG("IO event watch=%d fd=%d events=%d fromHost=%zu fromcont=%zu",
              watch, fd, events,
              console->fromHostLen,
              console->fromContLen);
1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237
    if (events & VIR_EVENT_HANDLE_READABLE) {
        char *buf;
        size_t *len;
        size_t avail;
        ssize_t done;
        if (watch == console->hostWatch) {
            buf = console->fromHostBuf;
            len = &console->fromHostLen;
            avail = sizeof(console->fromHostBuf) - *len;
        } else {
            buf = console->fromContBuf;
            len = &console->fromContLen;
            avail = sizeof(console->fromContBuf) - *len;
        }
1238
     reread:
1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265
        done = read(fd, buf + *len, avail);
        if (done == -1 && errno == EINTR)
            goto reread;
        if (done == -1 && errno != EAGAIN) {
            virReportSystemError(errno, "%s",
                                 _("Unable to read container pty"));
            goto error;
        }
        if (done > 0) {
            *len += done;
        } else {
            VIR_DEBUG("Read fd %d done %d errno %d", fd, (int)done, errno);
        }
    }

    if (events & VIR_EVENT_HANDLE_WRITABLE) {
        char *buf;
        size_t *len;
        ssize_t done;
        if (watch == console->hostWatch) {
            buf = console->fromContBuf;
            len = &console->fromContLen;
        } else {
            buf = console->fromHostBuf;
            len = &console->fromHostLen;
        }

1266
     rewrite:
1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291
        done = write(fd, buf, *len);
        if (done == -1 && errno == EINTR)
            goto rewrite;
        if (done == -1 && errno != EAGAIN) {
            virReportSystemError(errno, "%s",
                                 _("Unable to write to container pty"));
            goto error;
        }
        if (done > 0) {
            memmove(buf, buf + done, (*len - done));
            *len -= done;
        } else {
            VIR_DEBUG("Write fd %d done %d errno %d", fd, (int)done, errno);
        }
    }

    if (events & VIR_EVENT_HANDLE_HANGUP) {
        if (watch == console->hostWatch) {
            console->hostClosed = true;
        } else {
            console->contClosed = true;
        }
        VIR_DEBUG("Got EOF on %d %d", watch, fd);
    }

1292
    virLXCControllerConsoleUpdateWatch(console);
1293 1294 1295
    virMutexUnlock(&lock);
    return;

1296
 error:
1297 1298 1299
    virEventRemoveHandle(console->contWatch);
    virEventRemoveHandle(console->hostWatch);
    console->contWatch = console->hostWatch = -1;
1300
    virNetDaemonQuit(console->daemon);
1301 1302 1303 1304
    virMutexUnlock(&lock);
}


1305
/**
1306
 * lxcControllerMain
1307 1308
 * @serverFd: server socket fd to accept client requests
 * @clientFd: initial client which is the libvirtd daemon
1309
 *
1310
 * Processes I/O on consoles and the monitor
1311 1312 1313
 *
 * Returns 0 on success or -1 in case of error
 */
1314
static int virLXCControllerMain(virLXCControllerPtr ctrl)
1315 1316
{
    int rc = -1;
1317
    size_t i;
1318

1319
    if (virNetDaemonAddSignalHandler(ctrl->daemon,
1320 1321 1322
                                     SIGCHLD,
                                     virLXCControllerSignalChildIO,
                                     ctrl) < 0)
1323 1324
        goto cleanup;

1325 1326
    virResetLastError();

1327
    for (i = 0; i < ctrl->nconsoles; i++) {
1328
        if ((ctrl->consoles[i].epollFd = epoll_create1(EPOLL_CLOEXEC)) < 0) {
1329 1330 1331 1332 1333
            virReportSystemError(errno, "%s",
                                 _("Unable to create epoll fd"));
            goto cleanup;
        }

1334 1335 1336 1337 1338
        if ((ctrl->consoles[i].epollWatch = virEventAddHandle(ctrl->consoles[i].epollFd,
                                                              VIR_EVENT_HANDLE_READABLE,
                                                              virLXCControllerConsoleEPoll,
                                                              &(ctrl->consoles[i]),
                                                              NULL)) < 0) {
1339 1340
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("Unable to watch epoll FD"));
1341 1342 1343
            goto cleanup;
        }

1344 1345 1346 1347 1348
        if ((ctrl->consoles[i].hostWatch = virEventAddHandle(ctrl->consoles[i].hostFd,
                                                             VIR_EVENT_HANDLE_READABLE,
                                                             virLXCControllerConsoleIO,
                                                             &(ctrl->consoles[i]),
                                                             NULL)) < 0) {
1349 1350
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("Unable to watch host console PTY"));
1351 1352 1353
            goto cleanup;
        }

1354 1355 1356 1357 1358
        if ((ctrl->consoles[i].contWatch = virEventAddHandle(ctrl->consoles[i].contFd,
                                                             VIR_EVENT_HANDLE_READABLE,
                                                             virLXCControllerConsoleIO,
                                                             &(ctrl->consoles[i]),
                                                             NULL)) < 0) {
1359 1360
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("Unable to watch host console PTY"));
1361 1362
            goto cleanup;
        }
1363
    }
1364

1365
    virNetDaemonRun(ctrl->daemon);
1366

1367
    if (virGetLastErrorCode() == VIR_ERR_OK)
1368
        rc = wantReboot ? 1 : 0;
1369

1370
 cleanup:
1371
    for (i = 0; i < ctrl->nconsoles; i++)
1372
        virLXCControllerConsoleClose(&(ctrl->consoles[i]));
1373

1374 1375 1376
    return rc;
}

1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390
/**
 * virLXCControllerLookupUsernsMap
 * @map: array of ID mapping entries
 * @num: number of entries in @map
 * @src: ID to translate
 *
 * Translates @src through the first entry whose range
 * [start, start + count) contains it, the first ID of the range
 * included.
 *
 * Returns the translated ID (target plus the offset of @src within the
 * range), or @src unchanged if no entry covers it
 */
static unsigned int
virLXCControllerLookupUsernsMap(virDomainIdMapEntryPtr map,
                                int num,
                                unsigned int src)
{
    size_t i;

    /* Guard the signed count before it is used as an unsigned bound */
    if (num <= 0)
        return src;

    for (i = 0; i < (size_t)num; i++) {
        /* Comparing the offset against count cannot wrap the way
         * start + count could for ranges ending near UINT_MAX */
        if (src >= map[i].start &&
            src - map[i].start < map[i].count)
            return map[i].target + (src - map[i].start);
    }

    return src;
}
1391

1392 1393 1394 1395 1396 1397
/**
 * virLXCControllerSetupUsernsMap
 * @map: array of ID mapping entries
 * @num: number of entries in @map
 * @path: file to write the mapping to
 *
 * Formats every entry as a "start target count" line and writes the
 * result to @path in one go. More than 340 entries are rejected.
 *
 * Returns 0 on success or -1 in case of error
 */
static int
virLXCControllerSetupUsernsMap(virDomainIdMapEntryPtr map,
                               int num,
                               char *path)
{
    virBuffer content = VIR_BUFFER_INITIALIZER;
    int rc = -1;
    size_t idx = 0;

    /* The kernel supports up to 340 lines in /proc/<pid>/{g,u}id_map */
    if (num > 340) {
        virReportError(VIR_ERR_INVALID_ARG, "%s",
                       _("Too many id mappings defined."));
        goto cleanup;
    }

    while (idx < num) {
        virBufferAsprintf(&content, "%u %u %u\n",
                          map[idx].start, map[idx].target, map[idx].count);
        idx++;
    }

    VIR_DEBUG("Set '%s' to '%s'", path, virBufferCurrentContent(&content));

    if (virFileWriteStr(path, virBufferCurrentContent(&content), 0) < 0) {
        virReportSystemError(errno, _("unable write to %s"), path);
        goto cleanup;
    }

    rc = 0;

 cleanup:
    virBufferFreeAndReset(&content);
    return rc;
}

/**
 * virLXCControllerSetupUserns
 * @ctrl: controller of the container being started
 *
 * Writes the configured uid and gid mappings to /proc/<initpid>/uid_map
 * and /proc/<initpid>/gid_map. Nothing is done when the domain defines
 * no uid mapping.
 *
 * Returns 0 on success or -1 in case of error
 */
static int virLXCControllerSetupUserns(virLXCControllerPtr ctrl)
{
    g_autofree char *uidPath = NULL;
    g_autofree char *gidPath = NULL;

    /* User namespace is disabled for container */
    if (ctrl->def->idmap.nuidmap == 0) {
        VIR_DEBUG("No uid map, skipping userns setup");
        return 0;
    }

    VIR_DEBUG("Setting up userns maps");

    uidPath = g_strdup_printf("/proc/%d/uid_map", ctrl->initpid);
    if (virLXCControllerSetupUsernsMap(ctrl->def->idmap.uidmap,
                                       ctrl->def->idmap.nuidmap,
                                       uidPath) < 0)
        return -1;

    gidPath = g_strdup_printf("/proc/%d/gid_map", ctrl->initpid);
    if (virLXCControllerSetupUsernsMap(ctrl->def->idmap.gidmap,
                                       ctrl->def->idmap.ngidmap,
                                       gidPath) < 0)
        return -1;

    return 0;
}

1461 1462
/**
 * virLXCControllerSetupDev
 * @ctrl: controller of the container being started
 *
 * Sets up the container's private /dev directory at
 * LXC_STATE_DIR/<name>.dev using the security manager's mount options,
 * then hands ownership of it to the container.
 *
 * Returns 0 on success or -1 in case of error
 */
static int virLXCControllerSetupDev(virLXCControllerPtr ctrl)
{
    g_autofree char *secopts = NULL;
    g_autofree char *tmpfsOpts = NULL;
    g_autofree char *devdir = NULL;

    VIR_DEBUG("Setting up /dev/ for container");

    /* NOTE(review): the result is formatted with %s below without a
     * NULL check - confirm this helper cannot return NULL */
    secopts = virSecurityManagerGetMountOptions(ctrl->securityManager,
                                                ctrl->def);

    devdir = g_strdup_printf("/%s/%s.dev", LXC_STATE_DIR, ctrl->def->name);

    /*
     * tmpfs is limited to 64kb, since we only have device nodes in there
     * and don't want to DOS the entire OS RAM usage
     */
    tmpfsOpts = g_strdup_printf("mode=755,size=65536%s", secopts);

    if (virFileSetupDev(devdir, tmpfsOpts) < 0 ||
        lxcContainerChown(ctrl->def, devdir) < 0)
        return -1;

    return 0;
}

/**
 * virLXCControllerPopulateDevices
 * @ctrl: controller of the container being started
 *
 * Sets up the container's private /dev directory and creates the basic
 * character devices (null, zero, full, random, urandom, tty) in it,
 * each owned by the container.
 *
 * Returns 0 on success or -1 in case of error
 */
static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl)
{
    size_t i;
    const struct {
        int maj;
        int min;
        mode_t mode;
        const char *path;
    } devs[] = {
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL, 0666, "/null" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO, 0666, "/zero" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL, 0666, "/full" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM, 0666, "/random" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/urandom" },
        { LXC_DEV_MAJ_TTY, LXC_DEV_MIN_TTY, 0666, "/tty" },
    };

    if (virLXCControllerSetupDev(ctrl) < 0)
        return -1;

    /* Populate /dev/ with a few important bits */
    for (i = 0; i < G_N_ELEMENTS(devs); i++) {
        /* Scoped to the iteration so that every path is freed, not
         * just the last one */
        g_autofree char *path = NULL;
        dev_t dev = makedev(devs[i].maj, devs[i].min);

        path = g_strdup_printf("/%s/%s.dev/%s", LXC_STATE_DIR, ctrl->def->name,
                               devs[i].path);

        /* The node is created without permission bits; chmod() then
         * applies the intended mode */
        if (mknod(path, S_IFCHR, dev) < 0 ||
            chmod(path, devs[i].mode)) {
            virReportSystemError(errno,
                                 _("Failed to make device %s"),
                                 path);
            return -1;
        }

        if (lxcContainerChown(ctrl->def, path) < 0)
            return -1;
    }

    return 0;
}
1531

1532

1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544
/**
 * virLXCControllerSetupHostdevSubsysUSB
 * @vmDef: domain definition of the container
 * @def: USB host device to expose
 * @securityDriver: security manager used to label the new node
 *
 * Recreates the host's USB character device node for the given
 * bus/device under LXC_STATE_DIR/<name>.dev/bus/usb/, hands it to the
 * container and applies the hostdev security label.
 *
 * Returns 0 on success or -1 in case of error
 */
static int
virLXCControllerSetupHostdevSubsysUSB(virDomainDefPtr vmDef,
                                      virDomainHostdevDefPtr def,
                                      virSecurityManagerPtr securityDriver)
{
    virDomainHostdevSubsysUSBPtr usbsrc = &def->source.subsys.u.usb;
    g_autofree char *hostNode = NULL;
    g_autofree char *usbRoot = NULL;
    g_autofree char *busDir = NULL;
    g_autofree char *contNode = NULL;
    const mode_t nodeMode = 0700 | S_IFCHR;
    struct stat sb;

    hostNode = g_strdup_printf(USB_DEVFS "/%03d/%03d", usbsrc->bus, usbsrc->device);

    usbRoot = g_strdup_printf("/%s/%s.dev/bus/usb/", LXC_STATE_DIR, vmDef->name);

    busDir = g_strdup_printf("%s/%03d/", usbRoot, usbsrc->bus);

    contNode = g_strdup_printf("%s/%03d", busDir, usbsrc->device);

    if (stat(hostNode, &sb) < 0) {
        virReportSystemError(errno,
                             _("Unable to access %s"), hostNode);
        return -1;
    }

    if (!S_ISCHR(sb.st_mode)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("USB source %s was not a character device"),
                       hostNode);
        return -1;
    }

    if (virFileMakePath(busDir) < 0) {
        virReportSystemError(errno,
                             _("Unable to create %s"), busDir);
        return -1;
    }

    VIR_DEBUG("Creating dev %s (%d,%d)",
              contNode, major(sb.st_rdev), minor(sb.st_rdev));
    if (mknod(contNode, nodeMode, sb.st_rdev) < 0) {
        virReportSystemError(errno,
                             _("Unable to create device %s"),
                             contNode);
        return -1;
    }

    if (lxcContainerChown(vmDef, contNode) < 0)
        return -1;

    if (virSecurityManagerSetHostdevLabel(securityDriver,
                                          vmDef, def, usbRoot) < 0)
        return -1;

    return 0;
}


/**
 * virLXCControllerSetupHostdevCapsStorage
 * @vmDef: domain definition of the container
 * @def: storage capability host device to expose
 * @securityDriver: security manager used to label the new node
 *
 * Recreates the host block device under LXC_STATE_DIR/<name>.dev/. The
 * first component of the host path is dropped when building the
 * destination (e.g. "/dev/sda" ends up as "<name>.dev//sda"). The new
 * node is handed to the container and labelled.
 *
 * Returns 0 on success or -1 in case of error
 */
static int
virLXCControllerSetupHostdevCapsStorage(virDomainDefPtr vmDef,
                                        virDomainHostdevDefPtr def,
                                        virSecurityManagerPtr securityDriver)
{
    char *dst = NULL;
    const char *rel = NULL;
    int ret = -1;
    struct stat sb;
    mode_t mode;
    char *dev = def->source.caps.u.storage.block;

    if (dev == NULL) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Missing storage host block path"));
        goto cleanup;
    }

    /* Skip leading slashes, then drop the first path component */
    rel = dev;
    while (*rel == '/')
        rel++;

    if (!(rel = strchr(rel, '/'))) {
        /* Without a second component there is nothing to name the node */
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Unable to derive container device path from %s"),
                       dev);
        goto cleanup;
    }

    dst = g_strdup_printf("/%s/%s.dev/%s", LXC_STATE_DIR, vmDef->name, rel);

    if (stat(dev, &sb) < 0) {
        virReportSystemError(errno,
                             _("Unable to access %s"),
                             dev);
        goto cleanup;
    }

    if (!S_ISBLK(sb.st_mode)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Storage source %s must be a block device"),
                       dev);
        goto cleanup;
    }

    if (lxcContainerSetupHostdevCapsMakePath(dst) < 0) {
        /* errno is a system error, not a libvirt error code */
        virReportSystemError(errno,
                             _("Failed to create directory for device %s"),
                             dev);
        goto cleanup;
    }

    mode = 0700 | S_IFBLK;

    VIR_DEBUG("Creating dev %s (%d,%d)", dst,
              major(sb.st_rdev), minor(sb.st_rdev));
    if (mknod(dst, mode, sb.st_rdev) < 0) {
        virReportSystemError(errno,
                             _("Unable to create device %s"),
                             dst);
        goto cleanup;
    }

    if (lxcContainerChown(vmDef, dst) < 0)
        goto cleanup;

    /* Point the definition at the new node only while labelling it;
     * the original path is restored in cleanup */
    def->source.caps.u.storage.block = dst;
    if (virSecurityManagerSetHostdevLabel(securityDriver, vmDef, def, NULL) < 0)
        goto cleanup;

    ret = 0;

 cleanup:
    def->source.caps.u.storage.block = dev;
    VIR_FREE(dst);
    return ret;
}


/**
 * virLXCControllerSetupHostdevCapsMisc
 * @vmDef: domain definition of the container
 * @def: misc capability host device to expose
 * @securityDriver: security manager used to label the new node
 *
 * Recreates the host character device under LXC_STATE_DIR/<name>.dev/.
 * The first component of the host path is dropped when building the
 * destination (e.g. "/dev/foo" ends up as "<name>.dev//foo"). The new
 * node is handed to the container and labelled.
 *
 * Returns 0 on success or -1 in case of error
 */
static int
virLXCControllerSetupHostdevCapsMisc(virDomainDefPtr vmDef,
                                     virDomainHostdevDefPtr def,
                                     virSecurityManagerPtr securityDriver)
{
    char *dst = NULL;
    const char *rel = NULL;
    int ret = -1;
    struct stat sb;
    mode_t mode;
    char *dev = def->source.caps.u.misc.chardev;

    if (dev == NULL) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Missing host character device path"));
        goto cleanup;
    }

    /* Skip leading slashes, then drop the first path component */
    rel = dev;
    while (*rel == '/')
        rel++;

    if (!(rel = strchr(rel, '/'))) {
        /* Without a second component there is nothing to name the node */
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Unable to derive container device path from %s"),
                       dev);
        goto cleanup;
    }

    dst = g_strdup_printf("/%s/%s.dev/%s", LXC_STATE_DIR, vmDef->name, rel);

    if (stat(dev, &sb) < 0) {
        virReportSystemError(errno,
                             _("Unable to access %s"),
                             dev);
        goto cleanup;
    }

    if (!S_ISCHR(sb.st_mode)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Storage source %s must be a character device"),
                       dev);
        goto cleanup;
    }

    if (lxcContainerSetupHostdevCapsMakePath(dst) < 0) {
        /* errno is a system error, not a libvirt error code */
        virReportSystemError(errno,
                             _("Failed to create directory for device %s"),
                             dst);
        goto cleanup;
    }

    mode = 0700 | S_IFCHR;

    VIR_DEBUG("Creating dev %s (%d,%d)", dst,
              major(sb.st_rdev), minor(sb.st_rdev));
    if (mknod(dst, mode, sb.st_rdev) < 0) {
        /* Name the node that failed to be created, not its source */
        virReportSystemError(errno,
                             _("Unable to create device %s"),
                             dst);
        goto cleanup;
    }

    if (lxcContainerChown(vmDef, dst) < 0)
        goto cleanup;

    /* Point the definition at the new node only while labelling it;
     * the original path is restored in cleanup */
    def->source.caps.u.misc.chardev = dst;
    if (virSecurityManagerSetHostdevLabel(securityDriver, vmDef, def, NULL) < 0)
        goto cleanup;

    ret = 0;

 cleanup:
    def->source.caps.u.misc.chardev = dev;
    VIR_FREE(dst);
    return ret;
}

/**
 * virLXCControllerSetupHostdevSubsys
 * @vmDef: domain definition of the container
 * @def: subsystem host device to expose
 * @securityDriver: security manager used to label the device
 *
 * Dispatches on the subsystem type; only USB is supported.
 *
 * Returns 0 on success or -1 in case of error
 */
static int
virLXCControllerSetupHostdevSubsys(virDomainDefPtr vmDef,
                                   virDomainHostdevDefPtr def,
                                   virSecurityManagerPtr securityDriver)
{
    if (def->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB)
        return virLXCControllerSetupHostdevSubsysUSB(vmDef,
                                                     def,
                                                     securityDriver);

    virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                   _("Unsupported host device mode %s"),
                   virDomainHostdevSubsysTypeToString(def->source.subsys.type));
    return -1;
}


/**
 * virLXCControllerSetupHostdevCaps
 * @vmDef: domain definition of the container
 * @def: capability host device to expose
 * @securityDriver: security manager used to label the device
 *
 * Dispatches on the capability type of @def: storage and misc devices
 * get a device node created for them, network devices need no work
 * here.
 *
 * Returns 0 on success or -1 in case of error
 */
static int
virLXCControllerSetupHostdevCaps(virDomainDefPtr vmDef,
                                 virDomainHostdevDefPtr def,
                                 virSecurityManagerPtr securityDriver)
{
    /* A capabilities-mode device is discriminated by caps.type, not by
     * the subsystem type */
    switch (def->source.caps.type) {
    case VIR_DOMAIN_HOSTDEV_CAPS_TYPE_STORAGE:
        return virLXCControllerSetupHostdevCapsStorage(vmDef,
                                                       def,
                                                       securityDriver);

    case VIR_DOMAIN_HOSTDEV_CAPS_TYPE_MISC:
        return virLXCControllerSetupHostdevCapsMisc(vmDef,
                                                    def,
                                                    securityDriver);

    case VIR_DOMAIN_HOSTDEV_CAPS_TYPE_NET:
        return 0; /* case is handled in virLXCControllerMoveInterfaces */

    default:
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Unsupported host device mode %s"),
                       virDomainHostdevCapsTypeToString(def->source.caps.type));
        return -1;
    }
}


/**
 * virLXCControllerSetupAllHostdevs
 * @ctrl: controller of the container being started
 *
 * Sets up every host device of the domain, dispatching on its mode
 * (subsystem or capabilities). Stops at the first failure.
 *
 * Returns 0 on success or -1 in case of error
 */
static int
virLXCControllerSetupAllHostdevs(virLXCControllerPtr ctrl)
{
    virDomainDefPtr vmDef = ctrl->def;
    virSecurityManagerPtr securityDriver = ctrl->securityManager;
    size_t idx;

    VIR_DEBUG("Setting up hostdevs");

    for (idx = 0; idx < vmDef->nhostdevs; idx++) {
        virDomainHostdevDefPtr hostdev = vmDef->hostdevs[idx];
        int rc;

        switch (hostdev->mode) {
        case VIR_DOMAIN_HOSTDEV_MODE_SUBSYS:
            rc = virLXCControllerSetupHostdevSubsys(vmDef,
                                                    hostdev,
                                                    securityDriver);
            break;
        case VIR_DOMAIN_HOSTDEV_MODE_CAPABILITIES:
            rc = virLXCControllerSetupHostdevCaps(vmDef,
                                                  hostdev,
                                                  securityDriver);
            break;
        default:
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("Unsupported host device mode %s"),
                           virDomainHostdevModeTypeToString(hostdev->mode));
            return -1;
        }

        if (rc < 0)
            return -1;
    }

    VIR_DEBUG("Setup all hostdevs");
    return 0;
}


1838 1839 1840 1841 1842 1843 1844 1845
/*
 * Expose one disk of the domain to the container.
 *
 * The disk must be of block type and have a source path that stat()s as
 * a character or block device. A node with the same device number is
 * created at "/<LXC_STATE_DIR>/<domain name>.dev/<disk dst>", chowned
 * through lxcContainerChown() and then labelled by the security manager.
 *
 * Returns 0 on success, -1 on failure.
 */
static int virLXCControllerSetupDisk(virLXCControllerPtr ctrl,
                                     virDomainDiskDefPtr def,
                                     virSecurityManagerPtr securityDriver)
{
    char *srcPath = def->src->path;
    char *nodePath = NULL;
    struct stat st;
    mode_t nodeMode;
    int rc = -1;

    if (virDomainDiskGetType(def) != VIR_STORAGE_TYPE_BLOCK) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Can't setup disk for non-block device"));
        goto done;
    }

    if (srcPath == NULL) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Can't setup disk without media"));
        goto done;
    }

    nodePath = g_strdup_printf("/%s/%s.dev/%s", LXC_STATE_DIR,
                               ctrl->def->name, def->dst);

    if (stat(srcPath, &st) < 0) {
        virReportSystemError(errno,
                             _("Unable to access %s"), srcPath);
        goto done;
    }

    if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Disk source %s must be a character/block device"),
                       srcPath);
        goto done;
    }

    /* Owner-only permissions, node type copied from the source. */
    nodeMode = 0700 | (S_ISCHR(st.st_mode) ? S_IFCHR : S_IFBLK);

    /* The name of the node being created may not match what the
     * major:minor pair would normally imply. There is no other
     * option right now, so containerized applications simply have
     * to cope with that mismatch.
     */
    VIR_DEBUG("Creating dev %s (%d,%d) from %s",
              nodePath, major(st.st_rdev), minor(st.st_rdev), srcPath);
    if (mknod(nodePath, nodeMode, st.st_rdev) < 0) {
        virReportSystemError(errno,
                             _("Unable to create device %s"),
                             nodePath);
        goto done;
    }

    if (lxcContainerChown(ctrl->def, nodePath) < 0)
        goto done;

    /* The labelling code works on the source path, but it is the new
     * node that has to be labelled: swap the path in temporarily. The
     * original path is put back below on every exit route. */
    def->src->path = nodePath;
    if (virSecurityManagerSetImageLabel(securityDriver, ctrl->def, def->src,
                                        VIR_SECURITY_DOMAIN_IMAGE_LABEL_BACKING_CHAIN) < 0)
        goto done;

    rc = 0;

 done:
    def->src->path = srcPath;
    VIR_FREE(nodePath);
    return rc;
}

/*
 * Run virLXCControllerSetupDisk() for each disk of the domain, in
 * order, stopping at the first failure.
 *
 * Returns 0 if every disk was set up, -1 otherwise.
 */
static int virLXCControllerSetupAllDisks(virLXCControllerPtr ctrl)
{
    size_t idx = 0;

    VIR_DEBUG("Setting up disks");

    while (idx < ctrl->def->ndisks) {
        virDomainDiskDefPtr disk = ctrl->def->disks[idx];

        if (virLXCControllerSetupDisk(ctrl, disk, ctrl->securityManager) < 0)
            return -1;
        idx++;
    }

    VIR_DEBUG("Setup all disks");
    return 0;
}



1932
/**
1933
 * virLXCControllerMoveInterfaces
1934 1935 1936 1937 1938 1939 1940 1941
 * @nveths: number of interfaces
 * @veths: interface names
 * @container: pid of container
 *
 * Moves network interfaces into a container's namespace
 *
 * Returns 0 on success or -1 in case of error
 */
1942
static int virLXCControllerMoveInterfaces(virLXCControllerPtr ctrl)
1943
{
1944
    size_t i;
1945
    virDomainDefPtr def = ctrl->def;
1946

1947
    for (i = 0; i < ctrl->nveths; i++) {
1948
        if (virNetDevSetNamespace(ctrl->veths[i], ctrl->initpid) < 0)
1949
            return -1;
1950
    }
1951

1952 1953 1954 1955 1956 1957 1958 1959 1960
    for (i = 0; i < def->nhostdevs; i ++) {
        virDomainHostdevDefPtr hdev = def->hostdevs[i];

        if (hdev->mode != VIR_DOMAIN_HOSTDEV_MODE_CAPABILITIES)
            continue;

        virDomainHostdevCaps hdcaps = hdev->source.caps;

        if (hdcaps.type != VIR_DOMAIN_HOSTDEV_CAPS_TYPE_NET)
1961
            continue;
1962

1963
        if (virNetDevSetNamespace(hdcaps.u.net.ifname, ctrl->initpid) < 0)
1964 1965 1966
            return -1;
    }

1967 1968 1969 1970 1971
    return 0;
}


/**
1972 1973
 * virLXCControllerDeleteInterfaces:
 * @ctrl: the LXC controller
1974 1975 1976 1977 1978
 *
 * Cleans up the container interfaces by deleting the veth device pairs.
 *
 * Returns 0 on success or -1 in case of error
 */
1979
static int virLXCControllerDeleteInterfaces(virLXCControllerPtr ctrl)
1980
{
1981 1982
    size_t i;
    int ret = 0;
1983

1984
    for (i = 0; i < ctrl->nveths; i++) {
1985 1986 1987 1988 1989
        if (virNetDevVethDelete(ctrl->veths[i]) < 0)
            ret = -1;
    }

    return ret;
1990 1991
}

1992

1993 1994
/*
 * Switch the calling process to the PER_LINUX32 personality when the
 * domain's architecture equals the 32-bit alternative that
 * lxcContainerGetAlt32bitArch() reports for the host architecture.
 *
 * Returns 0 if nothing had to be done or the switch succeeded,
 * -1 if personality() failed.
 */
static int lxcSetPersonality(virDomainDefPtr def)
{
    virArch arch32;

    VIR_DEBUG("Checking for 32-bit personality");

    arch32 = lxcContainerGetAlt32bitArch(virArchFromHost());
    if (!arch32 || def->os.arch != arch32)
        return 0;

    VIR_DEBUG("Setting personality to %s",
              virArchToString(arch32));

    if (personality(PER_LINUX32) >= 0)
        return 0;

    virReportSystemError(errno, _("Unable to request personality for %s on %s"),
                         virArchToString(arch32),
                         virArchToString(virArchFromHost()));
    return -1;
}

2013
/* Provide MS_REC only when no earlier header has defined it. */
#ifndef MS_REC
# define MS_REC (1<<14)
#endif

/* Provide MS_SLAVE only when no earlier header has defined it. */
#ifndef MS_SLAVE
# define MS_SLAVE (1<<19)
#endif
2020

2021 2022 2023 2024 2025 2026
/* Create a private tty using the private devpts at PTMX, returning
 * the master in *TTYMASTER and the name of the slave, _from the
 * perspective of the guest after remounting file systems_, in
 * *TTYNAME.  Heavily borrowed from glibc, but doesn't require that
 * devpts == "/dev/pts" */
static int
2027 2028
lxcCreateTty(virLXCControllerPtr ctrl, int *ttymaster,
             char **ttyName, char **ttyHostPath)
2029 2030 2031 2032 2033
{
    int ret = -1;
    int ptyno;
    int unlock = 0;

2034
    if ((*ttymaster = open(ctrl->devptmx, O_RDWR|O_NOCTTY|O_NONBLOCK)) < 0)
2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048
        goto cleanup;

    if (ioctl(*ttymaster, TIOCSPTLCK, &unlock) < 0)
        goto cleanup;

    if (ioctl(*ttymaster, TIOCGPTN, &ptyno) < 0)
        goto cleanup;

    /* If mount() succeeded at honoring newinstance, then the kernel
     * was new enough to also honor the mode=0620,gid=5 options, which
     * guarantee that the new pty already has correct permissions; so
     * while glibc has to fstat(), fchmod(), and fchown() for older
     * kernels, we can skip those steps.  ptyno shouldn't currently be
     * anything other than 0, but let's play it safe.  */
2049 2050
    *ttyName = g_strdup_printf("/dev/pts/%d", ptyno);
    *ttyHostPath = g_strdup_printf("/%s/%s.devpts/%d", LXC_STATE_DIR, ctrl->def->name, ptyno);
2051 2052 2053

    ret = 0;

2054
 cleanup:
2055 2056 2057 2058 2059 2060 2061 2062
    if (ret != 0) {
        VIR_FORCE_CLOSE(*ttymaster);
        VIR_FREE(*ttyName);
    }

    return ret;
}

2063

2064 2065 2066
/*
 * Give the controller its own private mount namespace by delegating
 * to virProcessSetupPrivateMountNS().
 *
 * Why this is needed: for a chroot style setup a private /dev/pts has
 * to be prepared for the child, which the child later moves into
 * place. 'virsh console' must use the host OS /dev/pts while the
 * guest OS must use its own, so this process (libvirt_lxc) has to see
 * and use both instances. It runs in host OS context, though, and the
 * guest /dev/pts must not become visible there. Detaching the mount
 * namespace lets the controller see the guest's new /dev/pts without
 * exposing it to the host OS. The root FS is also switched to slave
 * mode in case it was marked as shared.
 *
 * Returns whatever virProcessSetupPrivateMountNS() returns.
 */
static int
virLXCControllerSetupPrivateNS(void)
{
    return virProcessSetupPrivateMountNS();
}


2092
static int
2093
virLXCControllerSetupDevPTS(virLXCControllerPtr ctrl)
2094
{
2095 2096 2097
    g_autofree char *mount_options = NULL;
    g_autofree char *opts = NULL;
    g_autofree char *devpts = NULL;
2098
    gid_t ptsgid = 5;
2099

2100
    VIR_DEBUG("Setting up private /dev/pts");
2101

2102 2103
    mount_options = virSecurityManagerGetMountOptions(ctrl->securityManager,
                                                      ctrl->def);
2104

2105 2106
    devpts = g_strdup_printf("%s/%s.devpts", LXC_STATE_DIR, ctrl->def->name);
    ctrl->devptmx = g_strdup_printf("%s/%s.devpts/ptmx", LXC_STATE_DIR, ctrl->def->name);
2107

2108 2109 2110 2111
    if (virFileMakePath(devpts) < 0) {
        virReportSystemError(errno,
                             _("Failed to make path %s"),
                             devpts);
2112
        return -1;
2113
    }
2114

2115 2116 2117 2118 2119
    if (ctrl->def->idmap.ngidmap)
        ptsgid = virLXCControllerLookupUsernsMap(ctrl->def->idmap.gidmap,
                                                 ctrl->def->idmap.ngidmap,
                                                 ptsgid);

2120 2121
    /* XXX should we support gid=X for X!=5 for distros which use
     * a different gid for tty?  */
2122 2123
    opts = g_strdup_printf("newinstance,ptmxmode=0666,mode=0620,gid=%u%s", ptsgid,
                           NULLSTR_EMPTY(mount_options));
2124

2125
    VIR_DEBUG("Mount devpts on %s type=tmpfs flags=0x%x, opts=%s",
2126 2127 2128 2129 2130
              devpts, MS_NOSUID, opts);
    if (mount("devpts", devpts, "devpts", MS_NOSUID, opts) < 0) {
        virReportSystemError(errno,
                             _("Failed to mount devpts on %s"),
                             devpts);
2131
        return -1;
2132
    }
2133

2134
    if (access(ctrl->devptmx, R_OK) < 0) {
2135 2136
        virReportSystemError(ENOSYS, "%s",
                             _("Kernel does not support private devpts"));
2137
        return -1;
2138 2139
    }

2140 2141
    if ((lxcContainerChown(ctrl->def, ctrl->devptmx) < 0) ||
        (lxcContainerChown(ctrl->def, devpts) < 0))
2142
        return -1;
2143

2144
    return 0;
2145 2146 2147
}


G
Gao feng 已提交
2148 2149 2150 2151 2152 2153
/*
 * Forward to lxcSetupFuse() with the controller's fuse slot and its
 * domain definition; the result is returned unchanged.
 */
static int
virLXCControllerSetupFuse(virLXCControllerPtr ctrl)
{
    virDomainDefPtr dom = ctrl->def;

    return lxcSetupFuse(&ctrl->fuse, dom);
}

2154 2155 2156 2157 2158 2159
/*
 * Forward to lxcStartFuse() with the controller's fuse object; the
 * result is returned unchanged.
 */
static int
virLXCControllerStartFuse(virLXCControllerPtr ctrl)
{
    int rv = lxcStartFuse(ctrl->fuse);

    return rv;
}

2160 2161 2162 2163 2164
/*
 * Allocate one tty per console of the controller via lxcCreateTty().
 *
 * The master fd goes into the console's contFd and the guest-visible
 * slave path into the matching slot of @containerTTYPaths. The
 * host-side slave path is only needed to chown the device and is
 * released again before the next console is handled.
 *
 * Returns 0 on success, -1 on the first failure.
 */
static int
virLXCControllerSetupConsoles(virLXCControllerPtr ctrl,
                              char **containerTTYPaths)
{
    size_t idx;

    for (idx = 0; idx < ctrl->nconsoles; idx++) {
        g_autofree char *hostPath = NULL;

        VIR_DEBUG("Opening tty on private %s", ctrl->devptmx);

        if (lxcCreateTty(ctrl,
                         &ctrl->consoles[idx].contFd,
                         &containerTTYPaths[idx],
                         &hostPath) < 0) {
            virReportSystemError(errno, "%s",
                                 _("Failed to allocate tty"));
            return -1;
        }

        /* Hand the tty device over to the root user of the container */
        if (lxcContainerChown(ctrl->def, hostPath) < 0)
            return -1;
    }

    return 0;
}


2188 2189 2190 2191 2192 2193 2194 2195
/*
 * Encode @data with @proc into an event message for procedure @procnr
 * and queue it on the controller's client.
 *
 * @data is always passed to xdr_free() before returning, whether the
 * event was sent, failed to be built, or was dropped because no client
 * is connected. Failures are not signalled to the caller.
 */
static void
virLXCControllerEventSend(virLXCControllerPtr ctrl,
                          int procnr,
                          xdrproc_t proc,
                          void *data)
{
    virNetMessagePtr msg = NULL;

    if (!ctrl->client) {
        VIR_WARN("Dropping event %d because libvirtd is not connected", procnr);
        /* Release @data on this path too, like every other exit. */
        goto cleanup;
    }

    VIR_DEBUG("Send event %d client=%p", procnr, ctrl->client);
    if (!(msg = virNetMessageNew(false)))
        goto cleanup;

    msg->header.prog = virNetServerProgramGetID(ctrl->prog);
    msg->header.vers = virNetServerProgramGetVersion(ctrl->prog);
    msg->header.proc = procnr;
    msg->header.type = VIR_NET_MESSAGE;
    msg->header.serial = 1;
    msg->header.status = VIR_NET_OK;

    if (virNetMessageEncodeHeader(msg) < 0)
        goto cleanup;

    if (virNetMessageEncodePayload(msg, proc, data) < 0)
        goto cleanup;

    VIR_DEBUG("Queue event %d %zu", procnr, msg->bufferLength);
    if (virNetServerClientSendMessage(ctrl->client, msg) < 0)
        goto cleanup;

    /* After a successful send the message must not be freed here
     * (presumably the client now owns it -- confirm against
     * virNetServerClientSendMessage). */
    msg = NULL;

 cleanup:
    virNetMessageFree(msg);
    xdr_free(proc, data);
}


/*
 * Send the exit event for the container.
 *
 * @exitstatus 0 is reported as shutdown, 1 as reboot and any other
 * value as error. If a client is connected, the controller is flagged
 * as shutting down, a delayed close is requested on the client and
 * the daemon is run before returning.
 *
 * Always returns 0.
 */
static int
virLXCControllerEventSendExit(virLXCControllerPtr ctrl,
                              int exitstatus)
{
    virLXCMonitorExitEventMsg event;

    VIR_DEBUG("Exit status %d (client=%p)", exitstatus, ctrl->client);

    memset(&event, 0, sizeof(event));
    if (exitstatus == 0)
        event.status = VIR_LXC_MONITOR_EXIT_STATUS_SHUTDOWN;
    else if (exitstatus == 1)
        event.status = VIR_LXC_MONITOR_EXIT_STATUS_REBOOT;
    else
        event.status = VIR_LXC_MONITOR_EXIT_STATUS_ERROR;

    virLXCControllerEventSend(ctrl,
                              VIR_LXC_MONITOR_PROC_EXIT_EVENT,
                              (xdrproc_t)xdr_virLXCMonitorExitEventMsg,
                              (void*)&event);

    if (ctrl->client) {
        VIR_DEBUG("Waiting for client to complete dispatch");
        ctrl->inShutdown = true;
        virNetServerClientDelayedClose(ctrl->client);
        virNetDaemonRun(ctrl->daemon);
    }

    VIR_DEBUG("Client has gone away");
    return 0;
}


2267 2268 2269 2270
/*
 * Send the init event carrying @initpid through
 * virLXCControllerEventSend().
 *
 * Always returns 0.
 */
static int
virLXCControllerEventSendInit(virLXCControllerPtr ctrl,
                              pid_t initpid)
{
    virLXCMonitorInitEventMsg event;

    VIR_DEBUG("Init pid %lld", (long long)initpid);

    memset(&event, 0, sizeof(event));
    event.initpid = initpid;

    virLXCControllerEventSend(ctrl,
                              VIR_LXC_MONITOR_PROC_INIT_EVENT,
                              (xdrproc_t)xdr_virLXCMonitorInitEventMsg,
                              (void*)&event);

    return 0;
}


2285
static int
2286
virLXCControllerRun(virLXCControllerPtr ctrl)
2287 2288 2289 2290 2291 2292 2293
{
    int rc = -1;
    int control[2] = { -1, -1};
    int containerhandshake[2] = { -1, -1 };
    char **containerTTYPaths = NULL;
    size_t i;

2294
    if (VIR_ALLOC_N(containerTTYPaths, ctrl->nconsoles) < 0)
2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308
        goto cleanup;

    if (socketpair(PF_UNIX, SOCK_STREAM, 0, control) < 0) {
        virReportSystemError(errno, "%s",
                             _("sockpair failed"));
        goto cleanup;
    }

    if (socketpair(PF_UNIX, SOCK_STREAM, 0, containerhandshake) < 0) {
        virReportSystemError(errno, "%s",
                             _("socketpair failed"));
        goto cleanup;
    }

2309 2310 2311
    if (virLXCControllerSetupPrivateNS() < 0)
        goto cleanup;

2312 2313 2314
    if (virLXCControllerSetupLoopDevices(ctrl) < 0)
        goto cleanup;

2315
    if (virLXCControllerSetupResourceLimits(ctrl) < 0)
2316 2317 2318 2319 2320
        goto cleanup;

    if (virLXCControllerSetupDevPTS(ctrl) < 0)
        goto cleanup;

2321 2322 2323
    if (virLXCControllerPopulateDevices(ctrl) < 0)
        goto cleanup;

2324 2325 2326
    if (virLXCControllerSetupAllDisks(ctrl) < 0)
        goto cleanup;

2327 2328 2329
    if (virLXCControllerSetupAllHostdevs(ctrl) < 0)
        goto cleanup;

G
Gao feng 已提交
2330 2331 2332
    if (virLXCControllerSetupFuse(ctrl) < 0)
        goto cleanup;

2333 2334
    if (virLXCControllerSetupConsoles(ctrl, containerTTYPaths) < 0)
        goto cleanup;
2335

2336
    if (lxcSetPersonality(ctrl->def) < 0)
2337
        goto cleanup;
2338

2339
    if ((ctrl->initpid = lxcContainerStart(ctrl->def,
2340
                                           ctrl->securityManager,
2341 2342
                                           ctrl->nveths,
                                           ctrl->veths,
2343 2344
                                           ctrl->npassFDs,
                                           ctrl->passFDs,
2345 2346
                                           control[1],
                                           containerhandshake[1],
I
ik.nitk 已提交
2347
                                           ctrl->nsFDs,
2348 2349
                                           ctrl->nconsoles,
                                           containerTTYPaths)) < 0)
2350
        goto cleanup;
2351
    VIR_FORCE_CLOSE(control[1]);
2352
    VIR_FORCE_CLOSE(containerhandshake[1]);
2353

2354 2355 2356
    for (i = 0; i < ctrl->npassFDs; i++)
        VIR_FORCE_CLOSE(ctrl->passFDs[i]);

I
ik.nitk 已提交
2357 2358 2359 2360
    if (ctrl->nsFDs)
        for (i = 0; i < VIR_LXC_DOMAIN_NAMESPACE_LAST; i++)
            VIR_FORCE_CLOSE(ctrl->nsFDs[i]);

2361 2362 2363
    if (virLXCControllerSetupCgroupLimits(ctrl) < 0)
        goto cleanup;

2364 2365 2366
    if (virLXCControllerSetupUserns(ctrl) < 0)
        goto cleanup;

2367
    if (virLXCControllerMoveInterfaces(ctrl) < 0)
2368 2369
        goto cleanup;

2370 2371 2372
    if (virLXCControllerStartFuse(ctrl) < 0)
        goto cleanup;

2373 2374 2375
    if (lxcContainerSendContinue(control[0]) < 0) {
        virReportSystemError(errno, "%s",
                             _("Unable to send container continue message"));
2376
        goto cleanup;
2377
    }
2378

2379 2380 2381 2382 2383 2384
    if (lxcContainerWaitForContinue(containerhandshake[0]) < 0) {
        virReportSystemError(errno, "%s",
                             _("error receiving signal from container"));
        goto cleanup;
    }

2385
    /* ...and reduce our privileges */
D
Daniel P. Berrange 已提交
2386 2387 2388
    if (lxcControllerClearCapabilities() < 0)
        goto cleanup;

2389
    for (i = 0; i < ctrl->nconsoles; i++)
2390
        if (virLXCControllerConsoleSetNonblocking(&(ctrl->consoles[i])) < 0)
2391
            goto cleanup;
2392

2393 2394 2395
    if (virLXCControllerDaemonHandshake(ctrl) < 0)
        goto cleanup;

2396 2397 2398 2399 2400 2401
    /* We must not hold open a dbus connection for life
     * of LXC instance, since dbus-daemon is limited to
     * only a few 100 connections by default
     */
    virDBusCloseSystemBus();

2402
    rc = virLXCControllerMain(ctrl);
2403

2404 2405
    virLXCControllerEventSendExit(ctrl, rc);

2406
 cleanup:
2407 2408
    VIR_FORCE_CLOSE(control[0]);
    VIR_FORCE_CLOSE(control[1]);
2409 2410
    VIR_FORCE_CLOSE(containerhandshake[0]);
    VIR_FORCE_CLOSE(containerhandshake[1]);
2411

2412
    for (i = 0; i < ctrl->nconsoles; i++)
2413 2414
        VIR_FREE(containerTTYPaths[i]);
    VIR_FREE(containerTTYPaths);
2415

2416
    virLXCControllerStopInit(ctrl);
2417

2418 2419 2420 2421
    return rc;
}


2422
int main(int argc, char *argv[])
2423 2424
{
    pid_t pid;
2425
    int rc = -1;
2426
    const char *name = NULL;
2427
    size_t nveths = 0;
2428
    char **veths = NULL;
I
ik.nitk 已提交
2429
    int ns_fd[VIR_LXC_DOMAIN_NAMESPACE_LAST];
2430
    int handshakeFd = -1;
2431
    bool bg = false;
2432
    const struct option options[] = {
2433 2434 2435 2436
        { "background", 0, NULL, 'b' },
        { "name",   1, NULL, 'n' },
        { "veth",   1, NULL, 'v' },
        { "console", 1, NULL, 'c' },
2437
        { "passfd", 1, NULL, 'p' },
2438
        { "handshakefd", 1, NULL, 's' },
2439
        { "security", 1, NULL, 'S' },
I
ik.nitk 已提交
2440 2441 2442
        { "share-net", 1, NULL, 'N' },
        { "share-ipc", 1, NULL, 'I' },
        { "share-uts", 1, NULL, 'U' },
2443 2444 2445
        { "help", 0, NULL, 'h' },
        { 0, 0, 0, 0 },
    };
2446 2447
    int *ttyFDs = NULL;
    size_t nttyFDs = 0;
2448 2449
    int *passFDs = NULL;
    size_t npassFDs = 0;
2450
    virLXCControllerPtr ctrl = NULL;
2451
    size_t i;
2452
    const char *securityDriver = "none";
2453

I
ik.nitk 已提交
2454 2455 2456
    for (i = 0; i < VIR_LXC_DOMAIN_NAMESPACE_LAST; i++)
        ns_fd[i] = -1;

2457
    if (virGettextInitialize() < 0 ||
2458
        virErrorInitialize() < 0) {
E
Eric Blake 已提交
2459 2460 2461 2462
        fprintf(stderr, _("%s: initialization failed\n"), argv[0]);
        exit(EXIT_FAILURE);
    }

2463 2464 2465
    /* Initialize logging */
    virLogSetFromEnv();

2466 2467
    while (1) {
        int c;
2468

I
ik.nitk 已提交
2469
        c = getopt_long(argc, argv, "dn:v:p:m:c:s:h:S:N:I:U:",
2470
                        options, NULL);
2471 2472 2473 2474 2475 2476

        if (c == -1)
            break;

        switch (c) {
        case 'b':
2477
            bg = true;
2478 2479 2480
            break;

        case 'n':
2481
            name = optarg;
2482 2483 2484
            break;

        case 'v':
2485
            if (VIR_REALLOC_N(veths, nveths+1) < 0)
2486
                goto cleanup;
2487
            veths[nveths++] = g_strdup(optarg);
2488 2489 2490
            break;

        case 'c':
2491
            if (VIR_REALLOC_N(ttyFDs, nttyFDs + 1) < 0)
2492 2493
                goto cleanup;
            if (virStrToLong_i(optarg, NULL, 10, &ttyFDs[nttyFDs++]) < 0) {
2494 2495 2496 2497 2498
                fprintf(stderr, "malformed --console argument '%s'", optarg);
                goto cleanup;
            }
            break;

2499 2500 2501 2502 2503 2504 2505 2506 2507
        case 'p':
            if (VIR_REALLOC_N(passFDs, npassFDs + 1) < 0)
                goto cleanup;
            if (virStrToLong_i(optarg, NULL, 10, &passFDs[npassFDs++]) < 0) {
                fprintf(stderr, "malformed --passfd argument '%s'", optarg);
                goto cleanup;
            }
            break;

2508
        case 's':
2509
            if (virStrToLong_i(optarg, NULL, 10, &handshakeFd) < 0) {
2510 2511 2512 2513 2514 2515
                fprintf(stderr, "malformed --handshakefd argument '%s'",
                        optarg);
                goto cleanup;
            }
            break;

I
ik.nitk 已提交
2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539
        case 'N':
            if (virStrToLong_i(optarg, NULL, 10, &ns_fd[VIR_LXC_DOMAIN_NAMESPACE_SHARENET]) < 0) {
                fprintf(stderr, "malformed --share-net argument '%s'",
                        optarg);
                goto cleanup;
            }
            break;

        case 'I':
            if (virStrToLong_i(optarg, NULL, 10, &ns_fd[VIR_LXC_DOMAIN_NAMESPACE_SHAREIPC]) < 0) {
                fprintf(stderr, "malformed --share-ipc argument '%s'",
                        optarg);
                goto cleanup;
            }
            break;

        case 'U':
            if (virStrToLong_i(optarg, NULL, 10, &ns_fd[VIR_LXC_DOMAIN_NAMESPACE_SHAREUTS]) < 0) {
                fprintf(stderr, "malformed --share-uts argument '%s'",
                        optarg);
                goto cleanup;
            }
            break;

2540
        case 'S':
2541
            securityDriver = optarg;
2542 2543
            break;

2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554
        case 'h':
        case '?':
            fprintf(stderr, "\n");
            fprintf(stderr, "syntax: %s [OPTIONS]\n", argv[0]);
            fprintf(stderr, "\n");
            fprintf(stderr, "Options\n");
            fprintf(stderr, "\n");
            fprintf(stderr, "  -b, --background\n");
            fprintf(stderr, "  -n NAME, --name NAME\n");
            fprintf(stderr, "  -c FD, --console FD\n");
            fprintf(stderr, "  -v VETH, --veth VETH\n");
2555
            fprintf(stderr, "  -s FD, --handshakefd FD\n");
2556
            fprintf(stderr, "  -S NAME, --security NAME\n");
I
ik.nitk 已提交
2557 2558 2559
            fprintf(stderr, "  -N FD, --share-net FD\n");
            fprintf(stderr, "  -I FD, --share-ipc FD\n");
            fprintf(stderr, "  -U FD, --share-uts FD\n");
2560 2561
            fprintf(stderr, "  -h, --help\n");
            fprintf(stderr, "\n");
2562
            rc = 0;
2563
            goto cleanup;
2564 2565 2566
        }
    }

2567 2568 2569 2570 2571
    if (name == NULL) {
        fprintf(stderr, "%s: missing --name argument for configuration\n", argv[0]);
        goto cleanup;
    }

2572
    if (handshakeFd < 0) {
2573
        fprintf(stderr, "%s: missing --handshakefd argument for container PTY\n",
2574 2575 2576 2577
                argv[0]);
        goto cleanup;
    }

2578
    if (geteuid() != 0) {
2579 2580 2581
        fprintf(stderr, "%s: must be run as the 'root' user\n", argv[0]);
        goto cleanup;
    }
2582

2583
    virEventRegisterDefaultImpl();
2584 2585

    virDBusSetSharedBus(false);
2586

2587
    if (!(ctrl = virLXCControllerNew(name)))
2588
        goto cleanup;
2589

2590 2591
    ctrl->handshakeFd = handshakeFd;

2592
    if (!(ctrl->securityManager = virSecurityManagerNew(securityDriver,
2593
                                                        LXC_DRIVER_NAME, 0)))
2594 2595
        goto cleanup;

2596 2597 2598 2599 2600 2601 2602 2603 2604
    if (ctrl->def->seclabels) {
        VIR_DEBUG("Security model %s type %s label %s imagelabel %s",
                  NULLSTR(ctrl->def->seclabels[0]->model),
                  virDomainSeclabelTypeToString(ctrl->def->seclabels[0]->type),
                  NULLSTR(ctrl->def->seclabels[0]->label),
                  NULLSTR(ctrl->def->seclabels[0]->imagelabel));
    } else {
        VIR_DEBUG("Security model not initialized");
    }
2605

2606 2607 2608
    ctrl->veths = veths;
    ctrl->nveths = nveths;

2609 2610 2611
    ctrl->passFDs = passFDs;
    ctrl->npassFDs = npassFDs;

I
ik.nitk 已提交
2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624
    for (i = 0; i < VIR_LXC_DOMAIN_NAMESPACE_LAST; i++) {
        if (ns_fd[i] != -1) {
            if (!ctrl->nsFDs) {/*allocate only once */
                size_t j = 0;
                if (VIR_ALLOC_N(ctrl->nsFDs, VIR_LXC_DOMAIN_NAMESPACE_LAST) < 0)
                    goto cleanup;
                for (j = 0; j < VIR_LXC_DOMAIN_NAMESPACE_LAST; j++)
                    ctrl->nsFDs[j] = -1;
            }
            ctrl->nsFDs[i] = ns_fd[i];
        }
    }

2625
    for (i = 0; i < nttyFDs; i++) {
2626 2627 2628 2629 2630
        if (virLXCControllerAddConsole(ctrl, ttyFDs[i]) < 0)
            goto cleanup;
        ttyFDs[i] = -1;
    }

2631
    if (virLXCControllerValidateNICs(ctrl) < 0)
2632
        goto cleanup;
2633

2634 2635 2636
    if (virLXCControllerGetNICIndexes(ctrl) < 0)
        goto cleanup;

2637 2638 2639
    if (virLXCControllerValidateConsoles(ctrl) < 0)
        goto cleanup;

2640
    if (virLXCControllerSetupServer(ctrl) < 0)
2641
        goto cleanup;
2642

2643 2644 2645
    if (bg) {
        if ((pid = fork()) < 0)
            goto cleanup;
2646

2647
        if (pid > 0) {
2648
            if ((rc = virPidFileWrite(LXC_STATE_DIR, name, pid)) < 0) {
2649
                virReportSystemError(-rc,
2650 2651
                                     _("Unable to write pid file '%s/%s.pid'"),
                                     LXC_STATE_DIR, name);
2652 2653
                _exit(1);
            }
2654

2655 2656 2657 2658
            /* First child now exits, allowing original caller
             * (ie libvirtd's LXC driver to complete their
             * waitpid & continue */
            _exit(0);
2659 2660
        }

E
Eric Blake 已提交
2661
        /* Don't hold on to any cwd we inherit from libvirtd either */
2662
        if (chdir("/") < 0) {
2663
            virReportSystemError(errno, "%s",
2664
                                 _("Unable to change to root dir"));
2665 2666 2667 2668
            goto cleanup;
        }

        if (setsid() < 0) {
2669
            virReportSystemError(errno, "%s",
2670
                                 _("Unable to become session leader"));
2671 2672 2673
            goto cleanup;
        }
    }
2674

2675
    rc = virLXCControllerRun(ctrl);
2676

2677
 cleanup:
2678
    if (rc < 0) {
2679 2680 2681
        fprintf(stderr,
                _("Failure in libvirt_lxc startup: %s\n"),
                virGetLastErrorMessage());
2682 2683
    }

2684
    virPidFileDelete(LXC_STATE_DIR, name);
2685 2686
    if (ctrl)
        virLXCControllerDeleteInterfaces(ctrl);
2687
    for (i = 0; i < nttyFDs; i++)
2688 2689
        VIR_FORCE_CLOSE(ttyFDs[i]);
    VIR_FREE(ttyFDs);
2690 2691 2692
    for (i = 0; i < npassFDs; i++)
        VIR_FORCE_CLOSE(passFDs[i]);
    VIR_FREE(passFDs);
2693

2694
    virLXCControllerFree(ctrl);
2695

2696
    return rc < 0? EXIT_FAILURE : EXIT_SUCCESS;
2697
}