/*
 * virpci.c: helper APIs for managing host PCI devices
 *
 * Copyright (C) 2009-2013 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * Authors:
 *     Mark McLoughlin <markmc@redhat.com>
 */

#include <config.h>

26
#include "virpci.h"
27 28 29 30 31 32 33 34 35 36

#include <dirent.h>
#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
37
#include <stdlib.h>
38

39
#include "dirname.h"
40
#include "virlog.h"
41
#include "viralloc.h"
42
#include "vircommand.h"
43
#include "virerror.h"
E
Eric Blake 已提交
44
#include "virfile.h"
45
#include "virkmod.h"
46 47
#include "virstring.h"
#include "virutil.h"
48 49 50 51 52

/* sysfs directory of the PCI bus.  Path fragments are appended to this
 * string directly, so the trailing slash is significant. */
#define PCI_SYSFS "/sys/bus/pci/"
#define PCI_ID_LEN 10   /* "XXXX XXXX" plus terminating NUL */
#define PCI_ADDR_LEN 13 /* "XXXX:XX:XX.X" plus terminating NUL */

53
struct _virPCIDevice {
54 55 56 57
    unsigned int  domain;
    unsigned int  bus;
    unsigned int  slot;
    unsigned int  function;
58 59 60

    char          name[PCI_ADDR_LEN]; /* domain:bus:slot.function */
    char          id[PCI_ID_LEN];     /* product vendor */
E
Eric Blake 已提交
61
    char          *path;
C
Chunyan Liu 已提交
62 63 64 65

    /* The driver:domain which uses the device */
    char          *used_by_drvname;
    char          *used_by_domname;
66

67 68
    unsigned int  pcie_cap_pos;
    unsigned int  pci_pm_cap_pos;
69 70
    bool          has_flr;
    bool          has_pm_reset;
71
    bool          managed;
72
    char          *stubDriver;
73 74

    /* used by reattach function */
75 76 77
    bool          unbind_from_stub;
    bool          remove_slot;
    bool          reprobe;
78 79
};

80
struct _virPCIDeviceList {
81 82
    virObjectLockable parent;

83
    size_t count;
84
    virPCIDevicePtr *devs;
85 86 87
};


88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
/* For virReportOOMError()  and virReportSystemError() */
#define VIR_FROM_THIS VIR_FROM_NONE

/* Specifications referenced in comments:
 *  PCI30  - PCI Local Bus Specification 3.0
 *  PCIe20 - PCI Express Base Specification 2.0
 *  BR12   - PCI-to-PCI Bridge Architecture Specification 1.2
 *  PM12   - PCI Bus Power Management Interface Specification 1.2
 *  ECN_AF - Advanced Capabilities for Conventional PCI ECN
 */

/* Type 0 config space header length; PCI30 Section 6.1 Configuration Space Organization */
#define PCI_CONF_LEN            0x100
#define PCI_CONF_HEADER_LEN     0x40

/* PCI30 6.2.1 */
#define PCI_HEADER_TYPE         0x0e    /* Header type */
#define PCI_HEADER_TYPE_BRIDGE 0x1
#define PCI_HEADER_TYPE_MASK   0x7f
#define PCI_HEADER_TYPE_MULTI  0x80

/* PCI30 6.2.1  Device Identification */
#define PCI_CLASS_DEVICE        0x0a    /* Device class */

/* Class Code for bridge; PCI30 D.7  Base Class 06h */
#define PCI_CLASS_BRIDGE_PCI    0x0604

/* PCI30 6.2.3  Device Status */
#define PCI_STATUS              0x06    /* 16 bits */
#define PCI_STATUS_CAP_LIST    0x10    /* Support Capability List */

/* PCI30 6.7  Capabilities List */
#define PCI_CAPABILITY_LIST     0x34    /* Offset of first capability list entry */

/* PM12 3.2.1  Capability Identifier */
#define PCI_CAP_ID_PM           0x01    /* Power Management */
/* PCI30 H Capability IDs */
#define PCI_CAP_ID_EXP          0x10    /* PCI Express */
/* ECN_AF 6.x.1.1  Capability ID for AF */
#define PCI_CAP_ID_AF           0x13    /* Advanced Features */

/* PCIe20 7.8.3  Device Capabilities Register (Offset 04h) */
#define PCI_EXP_DEVCAP          0x4     /* Device capabilities */
#define PCI_EXP_DEVCAP_FLR     (1<<28) /* Function Level Reset */

/* Header type 1 BR12 3.2 PCI-to-PCI Bridge Configuration Space Header Format */
#define PCI_PRIMARY_BUS         0x18    /* BR12 3.2.5.2 Primary bus number */
#define PCI_SECONDARY_BUS       0x19    /* BR12 3.2.5.3 Secondary bus number */
#define PCI_SUBORDINATE_BUS     0x1a    /* BR12 3.2.5.4 Highest bus number behind the bridge */
#define PCI_BRIDGE_CONTROL      0x3e
/* BR12 3.2.5.18  Bridge Control Register */
#define PCI_BRIDGE_CTL_RESET   0x40    /* Secondary bus reset */

/* PM12 3.2.4  Power Management Control/Status (Offset = 4) */
#define PCI_PM_CTRL                4    /* PM control and status register */
#define PCI_PM_CTRL_STATE_MASK    0x3  /* Current power state (D0 to D3) */
#define PCI_PM_CTRL_STATE_D0      0x0  /* D0 state */
#define PCI_PM_CTRL_STATE_D3hot   0x3  /* D3 state */
#define PCI_PM_CTRL_NO_SOFT_RESET 0x8  /* No reset for D3hot->D0 */

/* ECN_AF 6.x.1  Advanced Features Capability Structure */
#define PCI_AF_CAP              0x3     /* Advanced features capabilities */
#define PCI_AF_CAP_FLR         0x2     /* Function Level Reset */

#define PCI_EXP_FLAGS           0x2
#define PCI_EXP_FLAGS_TYPE      0x00f0
#define PCI_EXP_TYPE_DOWNSTREAM 0x6

#define PCI_EXT_CAP_BASE          0x100
#define PCI_EXT_CAP_LIMIT         0x1000
#define PCI_EXT_CAP_ID_MASK       0x0000ffff
#define PCI_EXT_CAP_OFFSET_SHIFT  20
#define PCI_EXT_CAP_OFFSET_MASK   0x00000ffc

#define PCI_EXT_CAP_ID_ACS      0x000d
#define PCI_EXT_ACS_CTRL        0x06

#define PCI_EXT_CAP_ACS_SV      0x01
#define PCI_EXT_CAP_ACS_RR      0x04
#define PCI_EXT_CAP_ACS_CR      0x08
#define PCI_EXT_CAP_ACS_UF      0x10
/* All ACS control bits that must be set for ACS to count as enabled */
#define PCI_EXT_CAP_ACS_ENABLED (PCI_EXT_CAP_ACS_SV |   \
                                 PCI_EXT_CAP_ACS_RR |   \
                                 PCI_EXT_CAP_ACS_CR |   \
                                 PCI_EXT_CAP_ACS_UF)

174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
static virClassPtr virPCIDeviceListClass;

static void virPCIDeviceListDispose(void *obj);

/* One-time module initializer: registers the virPCIDeviceList class as a
 * subclass of the lockable object class.
 *
 * Returns 0 on success, -1 if the class could not be created.
 */
static int virPCIOnceInit(void)
{
    virPCIDeviceListClass = virClassNew(virClassForObjectLockable(),
                                        "virPCIDeviceList",
                                        sizeof(virPCIDeviceList),
                                        virPCIDeviceListDispose);

    return virPCIDeviceListClass ? 0 : -1;
}

VIR_ONCE_GLOBAL_INIT(virPCI)

L
Laine Stump 已提交
191 192 193 194 195 196

static int
virPCIDriverDir(char **buffer, const char *driver)
{
    VIR_FREE(*buffer);

197 198 199
    if (virAsprintf(buffer, PCI_SYSFS "drivers/%s", driver) < 0)
        return -1;
    return 0;
L
Laine Stump 已提交
200 201 202 203 204 205 206 207
}


static int
virPCIDriverFile(char **buffer, const char *driver, const char *file)
{
    VIR_FREE(*buffer);

208 209 210
    if (virAsprintf(buffer, PCI_SYSFS "drivers/%s/%s", driver, file) < 0)
        return -1;
    return 0;
L
Laine Stump 已提交
211 212 213 214 215 216 217 218
}


static int
virPCIFile(char **buffer, const char *device, const char *file)
{
    VIR_FREE(*buffer);

219 220 221
    if (virAsprintf(buffer, PCI_SYSFS "devices/%s/%s", device, file) < 0)
        return -1;
    return 0;
L
Laine Stump 已提交
222 223 224 225 226 227 228 229 230 231
}


/* virPCIDeviceGetDriverPathAndName - put the path to the driver
 * directory of the driver in use for this device in @path and the
 * name of the driver in @name. Both could be NULL if it's not bound
 * to any driver.
 *
 * Return 0 for success, -1 for error.
 */
232
int
L
Laine Stump 已提交
233 234 235 236 237 238 239 240 241 242
virPCIDeviceGetDriverPathAndName(virPCIDevicePtr dev, char **path, char **name)
{
    int ret = -1;
    char *drvlink = NULL;

    *path = *name = NULL;
    /* drvlink = "/sys/bus/pci/dddd:bb:ss.ff/driver" */
    if (virPCIFile(&drvlink, dev->name, "driver") < 0)
        goto cleanup;

243 244 245 246 247
    if (!virFileExists(drvlink)) {
        ret = 0;
        goto cleanup;
    }

L
Laine Stump 已提交
248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276
    if (virFileIsLink(drvlink) != 1) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Invalid device %s driver file %s is not a symlink"),
                       dev->name, drvlink);
        goto cleanup;
    }
    if (virFileResolveLink(drvlink, path) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unable to resolve device %s driver symlink %s"),
                       dev->name, drvlink);
        goto cleanup;
    }
    /* path = "/sys/bus/pci/drivers/${drivername}" */

    if (VIR_STRDUP(*name, last_component(*path)) < 0)
        goto cleanup;
    /* name = "${drivername}" */

    ret = 0;
cleanup:
    VIR_FREE(drvlink);
    if (ret < 0) {
        VIR_FREE(*path);
        VIR_FREE(*name);
    }
    return ret;
}


277
static int
278
virPCIDeviceConfigOpen(virPCIDevicePtr dev, bool fatal)
279 280 281 282
{
    int fd;

    fd = open(dev->path, O_RDWR);
283

284
    if (fd < 0) {
285 286 287 288 289 290 291 292 293
        if (fatal) {
            virReportSystemError(errno,
                                 _("Failed to open config space file '%s'"),
                                 dev->path);
        } else {
            char ebuf[1024];
            VIR_WARN("Failed to open config space file '%s': %s",
                     dev->path, virStrerror(errno, ebuf, sizeof(ebuf)));
        }
294 295
        return -1;
    }
296

297
    VIR_DEBUG("%s %s: opened %s", dev->id, dev->name, dev->path);
298
    return fd;
299 300
}

301
static void
302
virPCIDeviceConfigClose(virPCIDevicePtr dev, int cfgfd)
303
{
304 305 306 307 308
    if (VIR_CLOSE(cfgfd) < 0) {
        char ebuf[1024];
        VIR_WARN("Failed to close config space file '%s': %s",
                 dev->path, virStrerror(errno, ebuf, sizeof(ebuf)));
    }
309 310
}

311

312
static int
313 314
virPCIDeviceRead(virPCIDevicePtr dev,
                 int cfgfd,
315
                 unsigned int pos,
316
                 uint8_t *buf,
317
                 unsigned int buflen)
318 319 320
{
    memset(buf, 0, buflen);

321 322
    if (lseek(cfgfd, pos, SEEK_SET) != pos ||
        saferead(cfgfd, buf, buflen) != buflen) {
323
        char ebuf[1024];
324
        VIR_WARN("Failed to read from '%s' : %s", dev->path,
325 326 327 328 329 330 331
                 virStrerror(errno, ebuf, sizeof(ebuf)));
        return -1;
    }
    return 0;
}

static uint8_t
332
virPCIDeviceRead8(virPCIDevicePtr dev, int cfgfd, unsigned int pos)
333 334
{
    uint8_t buf;
335
    virPCIDeviceRead(dev, cfgfd, pos, &buf, sizeof(buf));
336 337 338 339
    return buf;
}

static uint16_t
340
virPCIDeviceRead16(virPCIDevicePtr dev, int cfgfd, unsigned int pos)
341 342
{
    uint8_t buf[2];
343
    virPCIDeviceRead(dev, cfgfd, pos, &buf[0], sizeof(buf));
344 345 346 347
    return (buf[0] << 0) | (buf[1] << 8);
}

static uint32_t
348
virPCIDeviceRead32(virPCIDevicePtr dev, int cfgfd, unsigned int pos)
349 350
{
    uint8_t buf[4];
351
    virPCIDeviceRead(dev, cfgfd, pos, &buf[0], sizeof(buf));
352 353 354
    return (buf[0] << 0) | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24);
}

355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385
/* Read the 16-bit device class of @dev from its sysfs "class" file and
 * store it in *@device_class.
 *
 * The file holds a 24-bit value; the low 8 bits are shifted out and
 * the remaining 16 bits are what gets stored.
 *
 * Returns 0 on success, -1 on failure (path allocation, file read, or
 * unparsable contents).
 */
static int
virPCIDeviceReadClass(virPCIDevicePtr dev, uint16_t *device_class)
{
    char *path = NULL;
    char *id_str = NULL;
    int ret = -1;
    int len;
    unsigned int value;

    if (virPCIFile(&path, dev->name, "class") < 0)
        return ret;

    /* class string is '0xNNNNNN\n' ... i.e. 9 bytes */
    if ((len = virFileReadAll(path, 9, &id_str)) < 0)
        goto cleanup;

    /* Drop anything after the 8 significant characters (the trailing
     * newline).  Only truncate when that many bytes were actually read;
     * writing id_str[8] for shorter contents could land outside the
     * buffer, and such contents are already NUL-terminated. */
    if (len > 8)
        id_str[8] = '\0';
    if (virStrToLong_ui(id_str, NULL, 16, &value) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unusual value in %s/devices/%s/class: %s"),
                       PCI_SYSFS, dev->name, id_str);
        goto cleanup;
    }

    *device_class = (value >> 8) & 0xFFFF;
    ret = 0;
cleanup:
    VIR_FREE(id_str);
    VIR_FREE(path);
    return ret;
}

386
static int
387 388
virPCIDeviceWrite(virPCIDevicePtr dev,
                  int cfgfd,
389
                  unsigned int pos,
390
                  uint8_t *buf,
391
                  unsigned int buflen)
392
{
393 394
    if (lseek(cfgfd, pos, SEEK_SET) != pos ||
        safewrite(cfgfd, buf, buflen) != buflen) {
395
        char ebuf[1024];
396
        VIR_WARN("Failed to write to '%s' : %s", dev->path,
397 398 399 400 401 402 403
                 virStrerror(errno, ebuf, sizeof(ebuf)));
        return -1;
    }
    return 0;
}

static void
404
virPCIDeviceWrite16(virPCIDevicePtr dev, int cfgfd, unsigned int pos, uint16_t val)
405 406
{
    uint8_t buf[2] = { (val >> 0), (val >> 8) };
407
    virPCIDeviceWrite(dev, cfgfd, pos, &buf[0], sizeof(buf));
408 409 410
}

static void
411
virPCIDeviceWrite32(virPCIDevicePtr dev, int cfgfd, unsigned int pos, uint32_t val)
412
{
413
    uint8_t buf[4] = { (val >> 0), (val >> 8), (val >> 16), (val >> 24) };
414
    virPCIDeviceWrite(dev, cfgfd, pos, &buf[0], sizeof(buf));
415 416
}

E
Eric Blake 已提交
417 418
/* Callback type used by virPCIDeviceIterDevices(), which invokes it as
 * predicate(dev, candidate, data).  The iterator treats a negative
 * return as an error, 1 as a match, and any other value as "keep
 * looking".
 */
typedef int (*virPCIDeviceIterPredicate)(virPCIDevicePtr, virPCIDevicePtr,
                                         void *);
419 420 421 422 423 424 425

/* Iterate over available PCI devices calling @predicate
 * to compare each one to @dev.
 * Return -1 on error since we don't want to assume it is
 * safe to reset if there is an error.
 */
static int
426 427 428 429
virPCIDeviceIterDevices(virPCIDeviceIterPredicate predicate,
                        virPCIDevicePtr dev,
                        virPCIDevicePtr *matched,
                        void *data)
430 431 432
{
    DIR *dir;
    struct dirent *entry;
433
    int ret = 0;
434
    int rc;
435 436 437 438 439 440 441

    *matched = NULL;

    VIR_DEBUG("%s %s: iterating over " PCI_SYSFS "devices", dev->id, dev->name);

    dir = opendir(PCI_SYSFS "devices");
    if (!dir) {
442
        VIR_WARN("Failed to open " PCI_SYSFS "devices");
443 444 445 446
        return -1;
    }

    while ((entry = readdir(dir))) {
447
        unsigned int domain, bus, slot, function;
448
        virPCIDevicePtr check;
449
        char *tmp;
450 451 452 453 454

        /* Ignore '.' and '..' */
        if (entry->d_name[0] == '.')
            continue;

455 456 457 458 459 460 461 462 463
        /* expected format: <domain>:<bus>:<slot>.<function> */
        if (/* domain */
            virStrToLong_ui(entry->d_name, &tmp, 16, &domain) < 0 || *tmp != ':' ||
            /* bus */
            virStrToLong_ui(tmp + 1, &tmp, 16, &bus) < 0 || *tmp != ':' ||
            /* slot */
            virStrToLong_ui(tmp + 1, &tmp, 16, &slot) < 0 || *tmp != '.' ||
            /* function */
            virStrToLong_ui(tmp + 1, NULL, 16, &function) < 0) {
464 465 466 467
            VIR_WARN("Unusual entry in " PCI_SYSFS "devices: %s", entry->d_name);
            continue;
        }

468
        check = virPCIDeviceNew(domain, bus, slot, function);
469
        if (!check) {
470 471 472
            ret = -1;
            break;
        }
473

474 475 476
        rc = predicate(dev, check, data);
        if (rc < 0) {
            /* the predicate returned an error, bail */
477
            virPCIDeviceFree(check);
478 479 480 481
            ret = -1;
            break;
        }
        else if (rc == 1) {
482 483
            VIR_DEBUG("%s %s: iter matched on %s", dev->id, dev->name, check->name);
            *matched = check;
484
            ret = 1;
485 486
            break;
        }
487

488
        virPCIDeviceFree(check);
489 490
    }
    closedir(dir);
491
    return ret;
492 493 494
}

static uint8_t
495 496 497
virPCIDeviceFindCapabilityOffset(virPCIDevicePtr dev,
                                 int cfgfd,
                                 unsigned int capability)
498 499 500 501
{
    uint16_t status;
    uint8_t pos;

502
    status = virPCIDeviceRead16(dev, cfgfd, PCI_STATUS);
503 504 505
    if (!(status & PCI_STATUS_CAP_LIST))
        return 0;

506
    pos = virPCIDeviceRead8(dev, cfgfd, PCI_CAPABILITY_LIST);
507 508 509 510 511 512 513 514 515

    /* Zero indicates last capability, capabilities can't
     * be in the config space header and 0xff is returned
     * by the kernel if we don't have access to this region
     *
     * Note: we're not handling loops or extended
     * capabilities here.
     */
    while (pos >= PCI_CONF_HEADER_LEN && pos != 0xff) {
516
        uint8_t capid = virPCIDeviceRead8(dev, cfgfd, pos);
517 518 519 520 521 522
        if (capid == capability) {
            VIR_DEBUG("%s %s: found cap 0x%.2x at 0x%.2x",
                      dev->id, dev->name, capability, pos);
            return pos;
        }

523
        pos = virPCIDeviceRead8(dev, cfgfd, pos + 1);
524 525 526 527 528 529 530
    }

    VIR_DEBUG("%s %s: failed to find cap 0x%.2x", dev->id, dev->name, capability);

    return 0;
}

J
Jiri Denemark 已提交
531
static unsigned int
532 533
virPCIDeviceFindExtendedCapabilityOffset(virPCIDevicePtr dev,
                                         int cfgfd,
534
                                         unsigned int capability)
J
Jiri Denemark 已提交
535 536 537 538 539 540 541 542 543 544
{
    int ttl;
    unsigned int pos;
    uint32_t header;

    /* minimum 8 bytes per capability */
    ttl = (PCI_EXT_CAP_LIMIT - PCI_EXT_CAP_BASE) / 8;
    pos = PCI_EXT_CAP_BASE;

    while (ttl > 0 && pos >= PCI_EXT_CAP_BASE) {
545
        header = virPCIDeviceRead32(dev, cfgfd, pos);
J
Jiri Denemark 已提交
546 547 548 549 550 551 552 553 554 555 556

        if ((header & PCI_EXT_CAP_ID_MASK) == capability)
            return pos;

        pos = (header >> PCI_EXT_CAP_OFFSET_SHIFT) & PCI_EXT_CAP_OFFSET_MASK;
        ttl--;
    }

    return 0;
}

557 558 559 560
/* detects whether this device has FLR.  Returns 0 if the device does
 * not have FLR, 1 if it does, and -1 on error
 */
static int
561
virPCIDeviceDetectFunctionLevelReset(virPCIDevicePtr dev, int cfgfd)
562
{
M
Mark McLoughlin 已提交
563
    uint32_t caps;
564
    uint8_t pos;
565 566
    char *path;
    int found;
567 568 569 570 571 572 573 574

    /* The PCIe Function Level Reset capability allows
     * individual device functions to be reset without
     * affecting any other functions on the device or
     * any other devices on the bus. This is only common
     * on SR-IOV NICs at the moment.
     */
    if (dev->pcie_cap_pos) {
575
        caps = virPCIDeviceRead32(dev, cfgfd, dev->pcie_cap_pos + PCI_EXP_DEVCAP);
576 577 578 579 580 581 582 583 584 585
        if (caps & PCI_EXP_DEVCAP_FLR) {
            VIR_DEBUG("%s %s: detected PCIe FLR capability", dev->id, dev->name);
            return 1;
        }
    }

    /* The PCI AF Function Level Reset capability is
     * the same thing, except for conventional PCI
     * devices. This is not common yet.
     */
586
    pos = virPCIDeviceFindCapabilityOffset(dev, cfgfd, PCI_CAP_ID_AF);
587
    if (pos) {
588
        caps = virPCIDeviceRead16(dev, cfgfd, pos + PCI_AF_CAP);
589 590 591 592 593 594
        if (caps & PCI_AF_CAP_FLR) {
            VIR_DEBUG("%s %s: detected PCI FLR capability", dev->id, dev->name);
            return 1;
        }
    }

595 596 597 598 599 600
    /* there are some buggy devices that do support FLR, but forget to
     * advertise that fact in their capabilities.  However, FLR is *required*
     * to be present for virtual functions (VFs), so if we see that this
     * device is a VF, we just assume FLR works
     */

601
    if (virAsprintf(&path, PCI_SYSFS "devices/%s/physfn", dev->name) < 0)
602 603 604 605 606 607 608 609 610 611
        return -1;

    found = virFileExists(path);
    VIR_FREE(path);
    if (found) {
        VIR_DEBUG("%s %s: buggy device didn't advertise FLR, but is a VF; forcing flr on",
                  dev->id, dev->name);
        return 1;
    }

612 613 614 615 616 617 618 619 620
    VIR_DEBUG("%s %s: no FLR capability found", dev->id, dev->name);

    return 0;
}

/* Require the device has the PCI Power Management capability
 * and that a D3hot->D0 transition will results in a full
 * internal reset, not just a soft reset.
 */
621
static unsigned int
622
virPCIDeviceDetectPowerManagementReset(virPCIDevicePtr dev, int cfgfd)
623 624 625 626 627
{
    if (dev->pci_pm_cap_pos) {
        uint32_t ctl;

        /* require the NO_SOFT_RESET bit is clear */
628
        ctl = virPCIDeviceRead32(dev, cfgfd, dev->pci_pm_cap_pos + PCI_PM_CTRL);
629 630 631 632 633 634 635 636 637 638 639
        if (!(ctl & PCI_PM_CTRL_NO_SOFT_RESET)) {
            VIR_DEBUG("%s %s: detected PM reset capability", dev->id, dev->name);
            return 1;
        }
    }

    VIR_DEBUG("%s %s: no PM reset capability found", dev->id, dev->name);

    return 0;
}

640
/* Any active devices on the same domain/bus ? */
641
static int
642
virPCIDeviceSharesBusWithActive(virPCIDevicePtr dev, virPCIDevicePtr check, void *data)
643
{
644
    virPCIDeviceList *inactiveDevs = data;
645

646
    /* Different domain, different bus, or simply identical device */
647 648
    if (dev->domain != check->domain ||
        dev->bus != check->bus ||
649 650
        (dev->slot == check->slot &&
         dev->function == check->function))
651 652
        return 0;

653
    /* same bus, but inactive, i.e. about to be assigned to guest */
654
    if (inactiveDevs && virPCIDeviceListFind(inactiveDevs, check))
655
        return 0;
656

657
    return 1;
658 659
}

660 661 662
/* Look for a device sharing @dev's bus that is not listed in
 * @inactiveDevs (which may be NULL).
 *
 * Returns the first such device found (the caller must release it with
 * virPCIDeviceFree()), or NULL if there is none.
 *
 * NOTE(review): NULL is also returned when the device iteration itself
 * fails, so callers cannot tell "no active devices" from "could not
 * check"; the pointer return type leaves no room to report the error.
 */
static virPCIDevicePtr
virPCIDeviceBusContainsActiveDevices(virPCIDevicePtr dev,
                                     virPCIDeviceList *inactiveDevs)
{
    virPCIDevicePtr active = NULL;
    if (virPCIDeviceIterDevices(virPCIDeviceSharesBusWithActive,
                                dev, &active, inactiveDevs) < 0)
        return NULL;
    return active;
}

/* Is @check the parent of @dev ? */
672
static int
673
virPCIDeviceIsParent(virPCIDevicePtr dev, virPCIDevicePtr check, void *data)
674 675 676
{
    uint16_t device_class;
    uint8_t header_type, secondary, subordinate;
677
    virPCIDevicePtr *best = data;
678 679
    int ret = 0;
    int fd;
680

681
    if (dev->domain != check->domain)
682 683
        return 0;

684
    if ((fd = virPCIDeviceConfigOpen(check, false)) < 0)
685 686
        return 0;

687
    /* Is it a bridge? */
688 689
    ret = virPCIDeviceReadClass(check, &device_class);
    if (ret < 0 || device_class != PCI_CLASS_BRIDGE_PCI)
690
        goto cleanup;
691 692

    /* Is it a plane? */
693
    header_type = virPCIDeviceRead8(check, fd, PCI_HEADER_TYPE);
694
    if ((header_type & PCI_HEADER_TYPE_MASK) != PCI_HEADER_TYPE_BRIDGE)
695
        goto cleanup;
696

697 698
    secondary   = virPCIDeviceRead8(check, fd, PCI_SECONDARY_BUS);
    subordinate = virPCIDeviceRead8(check, fd, PCI_SUBORDINATE_BUS);
699

700
    VIR_DEBUG("%s %s: found parent device %s", dev->id, dev->name, check->name);
701

702 703 704
    /* if the secondary bus exactly equals the device's bus, then we found
     * the direct parent.  No further work is necessary
     */
705 706 707 708
    if (dev->bus == secondary) {
        ret = 1;
        goto cleanup;
    }
709

710
    /* otherwise, SRIOV allows VFs to be on different buses than their PFs.
711 712 713 714 715
     * In this case, what we need to do is look for the "best" match; i.e.
     * the most restrictive match that still satisfies all of the conditions.
     */
    if (dev->bus > secondary && dev->bus <= subordinate) {
        if (*best == NULL) {
716 717
            *best = virPCIDeviceNew(check->domain, check->bus, check->slot,
                                    check->function);
718 719 720 721 722
            if (*best == NULL) {
                ret = -1;
                goto cleanup;
            }
        } else {
723 724 725 726
            /* OK, we had already recorded a previous "best" match for the
             * parent.  See if the current device is more restrictive than the
             * best, and if so, make it the new best
             */
727 728 729
            int bestfd;
            uint8_t best_secondary;

730
            if ((bestfd = virPCIDeviceConfigOpen(*best, false)) < 0)
731
                goto cleanup;
732 733
            best_secondary = virPCIDeviceRead8(*best, bestfd, PCI_SECONDARY_BUS);
            virPCIDeviceConfigClose(*best, bestfd);
734 735

            if (secondary > best_secondary) {
736 737 738
                virPCIDeviceFree(*best);
                *best = virPCIDeviceNew(check->domain, check->bus, check->slot,
                                        check->function);
739 740 741 742
                if (*best == NULL) {
                    ret = -1;
                    goto cleanup;
                }
743 744 745 746
            }
        }
    }

747
cleanup:
748
    virPCIDeviceConfigClose(check, fd);
749
    return ret;
750 751
}

752
static int
753
virPCIDeviceGetParent(virPCIDevicePtr dev, virPCIDevicePtr *parent)
754
{
755
    virPCIDevicePtr best = NULL;
756 757 758
    int ret;

    *parent = NULL;
759
    ret = virPCIDeviceIterDevices(virPCIDeviceIsParent, dev, parent, &best);
760
    if (ret == 1)
761
        virPCIDeviceFree(best);
762 763 764
    else if (ret == 0)
        *parent = best;
    return ret;
765 766 767 768 769 770
}

/* Secondary Bus Reset is our sledgehammer - it resets all
 * devices behind a bus.
 */
static int
771 772 773
virPCIDeviceTrySecondaryBusReset(virPCIDevicePtr dev,
                                 int cfgfd,
                                 virPCIDeviceList *inactiveDevs)
774
{
775
    virPCIDevicePtr parent, conflict;
776 777 778
    uint8_t config_space[PCI_CONF_LEN];
    uint16_t ctl;
    int ret = -1;
779
    int parentfd;
780

781 782 783
    /* Refuse to do a secondary bus reset if there are other
     * devices/functions behind the bus are used by the host
     * or other guests.
784
     */
785
    if ((conflict = virPCIDeviceBusContainsActiveDevices(dev, inactiveDevs))) {
786
        virReportError(VIR_ERR_INTERNAL_ERROR,
787 788
                       _("Active %s devices on bus with %s, not doing bus reset"),
                       conflict->name, dev->name);
789 790 791 792
        return -1;
    }

    /* Find the parent bus */
793
    if (virPCIDeviceGetParent(dev, &parent) < 0)
794
        return -1;
795
    if (!parent) {
796
        virReportError(VIR_ERR_INTERNAL_ERROR,
797 798
                       _("Failed to find parent device for %s"),
                       dev->name);
799 800
        return -1;
    }
801
    if ((parentfd = virPCIDeviceConfigOpen(parent, true)) < 0)
802
        goto out;
803 804 805 806 807 808 809

    VIR_DEBUG("%s %s: doing a secondary bus reset", dev->id, dev->name);

    /* Save and restore the device's config space; we only do this
     * for the supplied device since we refuse to do a reset if there
     * are multiple devices/functions
     */
810
    if (virPCIDeviceRead(dev, cfgfd, 0, config_space, PCI_CONF_LEN) < 0) {
811
        virReportError(VIR_ERR_INTERNAL_ERROR,
812
                       _("Failed to read PCI config space for %s"),
813
                       dev->name);
814 815 816 817 818 819
        goto out;
    }

    /* Read the control register, set the reset flag, wait 200ms,
     * unset the reset flag and wait 200ms.
     */
820
    ctl = virPCIDeviceRead16(dev, cfgfd, PCI_BRIDGE_CONTROL);
821

822 823
    virPCIDeviceWrite16(parent, parentfd, PCI_BRIDGE_CONTROL,
                        ctl | PCI_BRIDGE_CTL_RESET);
824 825 826

    usleep(200 * 1000); /* sleep 200ms */

827
    virPCIDeviceWrite16(parent, parentfd, PCI_BRIDGE_CONTROL, ctl);
828 829 830

    usleep(200 * 1000); /* sleep 200ms */

831
    if (virPCIDeviceWrite(dev, cfgfd, 0, config_space, PCI_CONF_LEN) < 0) {
832
        virReportError(VIR_ERR_INTERNAL_ERROR,
833 834 835 836
                       _("Failed to restore PCI config space for %s"),
                       dev->name);
        goto out;
    }
837
    ret = 0;
838

839
out:
840 841
    virPCIDeviceConfigClose(parent, parentfd);
    virPCIDeviceFree(parent);
842 843 844 845 846 847 848 849
    return ret;
}

/* Power management reset attempts to reset a device using a
 * D-state transition from D3hot to D0. Note, in detect_pm_reset()
 * above we require the device supports a full internal reset.
 */
static int
850
virPCIDeviceTryPowerManagementReset(virPCIDevicePtr dev, int cfgfd)
851 852 853 854 855 856 857 858
{
    uint8_t config_space[PCI_CONF_LEN];
    uint32_t ctl;

    if (!dev->pci_pm_cap_pos)
        return -1;

    /* Save and restore the device's config space. */
859
    if (virPCIDeviceRead(dev, cfgfd, 0, &config_space[0], PCI_CONF_LEN) < 0) {
860
        virReportError(VIR_ERR_INTERNAL_ERROR,
861
                       _("Failed to read PCI config space for %s"),
862
                       dev->name);
863 864 865 866 867
        return -1;
    }

    VIR_DEBUG("%s %s: doing a power management reset", dev->id, dev->name);

868
    ctl = virPCIDeviceRead32(dev, cfgfd, dev->pci_pm_cap_pos + PCI_PM_CTRL);
869 870
    ctl &= ~PCI_PM_CTRL_STATE_MASK;

871 872
    virPCIDeviceWrite32(dev, cfgfd, dev->pci_pm_cap_pos + PCI_PM_CTRL,
                        ctl | PCI_PM_CTRL_STATE_D3hot);
873 874 875

    usleep(10 * 1000); /* sleep 10ms */

876 877
    virPCIDeviceWrite32(dev, cfgfd, dev->pci_pm_cap_pos + PCI_PM_CTRL,
                        ctl | PCI_PM_CTRL_STATE_D0);
878 879 880

    usleep(10 * 1000); /* sleep 10ms */

881
    if (virPCIDeviceWrite(dev, cfgfd, 0, &config_space[0], PCI_CONF_LEN) < 0) {
882
        virReportError(VIR_ERR_INTERNAL_ERROR,
883 884 885 886
                       _("Failed to restore PCI config space for %s"),
                       dev->name);
        return -1;
    }
887 888 889 890 891

    return 0;
}

static int
892
virPCIDeviceInit(virPCIDevicePtr dev, int cfgfd)
893
{
894 895
    int flr;

896 897 898
    dev->pcie_cap_pos   = virPCIDeviceFindCapabilityOffset(dev, cfgfd, PCI_CAP_ID_EXP);
    dev->pci_pm_cap_pos = virPCIDeviceFindCapabilityOffset(dev, cfgfd, PCI_CAP_ID_PM);
    flr = virPCIDeviceDetectFunctionLevelReset(dev, cfgfd);
899
    if (flr < 0)
900
        return flr;
901 902
    dev->has_flr        = !!flr;
    dev->has_pm_reset   = !!virPCIDeviceDetectPowerManagementReset(dev, cfgfd);
903

904 905 906 907
    return 0;
}

int
908 909 910
virPCIDeviceReset(virPCIDevicePtr dev,
                  virPCIDeviceList *activeDevs,
                  virPCIDeviceList *inactiveDevs)
911
{
912 913
    char *drvPath = NULL;
    char *drvName = NULL;
914
    int ret = -1;
915
    int fd = -1;
916

917
    if (activeDevs && virPCIDeviceListFind(activeDevs, dev)) {
918
        virReportError(VIR_ERR_INTERNAL_ERROR,
919 920 921 922
                       _("Not resetting active device %s"), dev->name);
        return -1;
    }

923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938
    /* If the device is currently bound to vfio-pci, ignore all
     * requests to reset it, since the vfio-pci driver will always
     * reset it whenever appropriate, so doing it ourselves would just
     * be redundant.
     */
    if (virPCIDeviceGetDriverPathAndName(dev, &drvPath, &drvName) < 0)
        goto cleanup;

    if (STREQ_NULLABLE(drvName, "vfio-pci")) {
        VIR_DEBUG("Device %s is bound to vfio-pci - skip reset",
                  dev->name);
        ret = 0;
        goto cleanup;
    }
    VIR_DEBUG("Resetting device %s", dev->name);

939
    if ((fd = virPCIDeviceConfigOpen(dev, true)) < 0)
940
        goto cleanup;
941

942
    if (virPCIDeviceInit(dev, fd) < 0)
943 944
        goto cleanup;

945 946 947
    /* KVM will perform FLR when starting and stopping
     * a guest, so there is no need for us to do it here.
     */
948 949 950 951
    if (dev->has_flr) {
        ret = 0;
        goto cleanup;
    }
952

953 954 955 956 957
    /* If the device supports PCI power management reset,
     * that's the next best thing because it only resets
     * the function, not the whole device.
     */
    if (dev->has_pm_reset)
958
        ret = virPCIDeviceTryPowerManagementReset(dev, fd);
959

960
    /* Bus reset is not an option with the root bus */
961
    if (ret < 0 && dev->bus != 0)
962
        ret = virPCIDeviceTrySecondaryBusReset(dev, fd, inactiveDevs);
963

964 965
    if (ret < 0) {
        virErrorPtr err = virGetLastError();
966
        virReportError(VIR_ERR_INTERNAL_ERROR,
967 968
                       _("Unable to reset PCI device %s: %s"),
                       dev->name,
969 970
                       err ? err->message :
                       _("no FLR, PM reset or bus reset available"));
971 972
    }

973
cleanup:
974 975
    VIR_FREE(drvPath);
    VIR_FREE(drvName);
976
    virPCIDeviceConfigClose(dev, fd);
977 978 979
    return ret;
}

980

981
static int
982
virPCIProbeStubDriver(const char *driver)
983
{
984
    char *drvpath = NULL;
985
    bool probed = false;
986 987

recheck:
988
    if (virPCIDriverDir(&drvpath, driver) == 0 && virFileExists(drvpath)) {
989
        /* driver already loaded, return */
990
        VIR_FREE(drvpath);
991
        return 0;
992 993 994
    }

    VIR_FREE(drvpath);
995 996

    if (!probed) {
997
        char *errbuf = NULL;
998
        probed = true;
999 1000 1001 1002
        if ((errbuf = virKModLoad(driver, true))) {
            VIR_WARN("failed to load driver %s: %s", driver, errbuf);
            VIR_FREE(errbuf);
            goto cleanup;
1003
        }
1004 1005

        goto recheck;
1006 1007
    }

1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022
cleanup:
    /* If we know failure was because of blacklist, let's report that;
     * otherwise, report a more generic failure message
     */
    if (virKModIsBlacklisted(driver)) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Failed to load PCI stub module %s: "
                         "administratively prohibited"),
                       driver);
    } else {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Failed to load PCI stub module %s"),
                       driver);
    }

1023
    return -1;
1024 1025
}

1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063
/* virPCIDeviceUnbind:
 * @dev: device to unbind
 * @reprobe: value stored in dev->reprobe once the unbind was written
 *
 * Unbind @dev from whatever driver it is currently bound to by writing
 * its address to the "driver/unbind" file of the device. A device that
 * has no driver is left untouched.
 *
 * Returns 0 on success, -1 on failure.
 */
int
virPCIDeviceUnbind(virPCIDevicePtr dev, bool reprobe)
{
    char *unbindFile = NULL;
    char *driverDir = NULL;
    char *driverName = NULL;
    int rc = -1;

    if (virPCIDeviceGetDriverPathAndName(dev, &driverDir, &driverName) < 0)
        goto cleanup;

    /* Nothing to do when the device is not bound to any driver */
    if (driverName) {
        if (virPCIFile(&unbindFile, dev->name, "driver/unbind") < 0)
            goto cleanup;

        if (virFileExists(unbindFile)) {
            if (virFileWriteStr(unbindFile, dev->name, 0) < 0) {
                virReportSystemError(errno,
                                     _("Failed to unbind PCI device '%s' from %s"),
                                     dev->name, driverName);
                goto cleanup;
            }
            dev->reprobe = reprobe;
        }
    }

    rc = 0;

cleanup:
    VIR_FREE(unbindFile);
    VIR_FREE(driverDir);
    VIR_FREE(driverName);
    return rc;
}

1064 1065 1066 1067 1068 1069 1070
/* NULL-terminated list of the "well known" stub driver names.
 * virPCIDeviceUnbindFromStub() only unbinds a device from its current
 * driver if that driver's name appears in this table.
 */
static const char *virPCIKnownStubs[] = {
    "pciback",  /* used by xen */
    "pci-stub", /* used by kvm legacy passthrough */
    "vfio-pci", /* used by VFIO device assignment */
    NULL        /* end-of-list sentinel */
};

1071
static int
1072
virPCIDeviceUnbindFromStub(virPCIDevicePtr dev)
1073 1074 1075 1076
{
    int result = -1;
    char *drvdir = NULL;
    char *path = NULL;
1077 1078 1079
    char *driver = NULL;
    const char **stubTest;
    bool isStub = false;
1080

1081 1082 1083
    /* If the device is currently bound to one of the "well known"
     * stub drivers, then unbind it, otherwise ignore it.
     */
L
Laine Stump 已提交
1084
    if (virPCIDeviceGetDriverPathAndName(dev, &drvdir, &driver) < 0)
1085
        goto cleanup;
E
Eric Blake 已提交
1086

1087 1088 1089 1090 1091
    if (!driver) {
        /* The device is not bound to any driver and we are almost done. */
        goto reprobe;
    }

1092 1093 1094
    if (!dev->unbind_from_stub)
        goto remove_slot;

1095 1096 1097 1098 1099 1100 1101 1102 1103 1104
    /* If the device isn't bound to a known stub, skip the unbind. */
    for (stubTest = virPCIKnownStubs; *stubTest != NULL; stubTest++) {
        if (STREQ(driver, *stubTest)) {
            isStub = true;
            VIR_DEBUG("Found stub driver %s", *stubTest);
            break;
        }
    }
    if (!isStub)
        goto remove_slot;
1105

1106 1107
    if (virPCIDeviceUnbind(dev, dev->reprobe) < 0)
        goto cleanup;
1108
    dev->unbind_from_stub = false;
1109 1110 1111 1112

remove_slot:
    if (!dev->remove_slot)
        goto reprobe;
1113 1114

    /* Xen's pciback.ko wants you to use remove_slot on the specific device */
1115
    if (virPCIDriverFile(&path, driver, "remove_slot") < 0) {
1116 1117 1118 1119 1120
        goto cleanup;
    }

    if (virFileExists(path) && virFileWriteStr(path, dev->name, 0) < 0) {
        virReportSystemError(errno,
1121
                             _("Failed to remove slot for PCI device '%s' from %s"),
1122 1123 1124
                             dev->name, driver);
        goto cleanup;
    }
1125
    dev->remove_slot = false;
1126 1127 1128 1129 1130 1131

reprobe:
    if (!dev->reprobe) {
        result = 0;
        goto cleanup;
    }
1132 1133 1134 1135 1136 1137

    /* Trigger a re-probe of the device is not in the stub's dynamic
     * ID table. If the stub is available, but 'remove_id' isn't
     * available, then re-probing would just cause the device to be
     * re-bound to the stub.
     */
1138
    if (driver && virPCIDriverFile(&path, driver, "remove_id") < 0)
1139 1140
        goto cleanup;

1141
    if (!driver || !virFileExists(drvdir) || virFileExists(path)) {
1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152
        if (virFileWriteStr(PCI_SYSFS "drivers_probe", dev->name, 0) < 0) {
            virReportSystemError(errno,
                                 _("Failed to trigger a re-probe for PCI device '%s'"),
                                 dev->name);
            goto cleanup;
        }
    }

    result = 0;

cleanup:
1153
    /* do not do it again */
1154 1155 1156
    dev->unbind_from_stub = false;
    dev->remove_slot = false;
    dev->reprobe = false;
1157

1158 1159
    VIR_FREE(drvdir);
    VIR_FREE(path);
1160
    VIR_FREE(driver);
1161 1162 1163 1164

    return result;
}

1165 1166

static int
1167 1168
virPCIDeviceBindToStub(virPCIDevicePtr dev,
                       const char *stubDriverName)
1169
{
1170
    int result = -1;
1171
    int reprobe = false;
1172 1173 1174
    char *stubDriverPath = NULL;
    char *driverLink = NULL;
    char *path = NULL; /* reused for different purposes */
J
Jiri Denemark 已提交
1175 1176
    char *newDriverName = NULL;
    virErrorPtr err = NULL;
1177 1178

    if (virPCIDriverDir(&stubDriverPath, stubDriverName) < 0 ||
J
Jiri Denemark 已提交
1179 1180
        virPCIFile(&driverLink, dev->name, "driver") < 0 ||
        VIR_STRDUP(newDriverName, stubDriverName) < 0)
1181 1182
        goto cleanup;

1183 1184 1185 1186 1187
    if (virFileExists(driverLink)) {
        if (virFileLinkPointsTo(driverLink, stubDriverPath)) {
            /* The device is already bound to the correct driver */
            VIR_DEBUG("Device %s is already bound to %s",
                      dev->name, stubDriverName);
1188 1189 1190
            result = 0;
            goto cleanup;
        }
1191
        reprobe = true;
1192
    }
1193 1194 1195 1196 1197 1198 1199 1200 1201

    /* Add the PCI device ID to the stub's dynamic ID table;
     * this is needed to allow us to bind the device to the stub.
     * Note: if the device is not currently bound to any driver,
     * stub will immediately be bound to the device. Also, note
     * that if a new device with this ID is hotplugged, or if a probe
     * is triggered for such a device, it will also be immediately
     * bound by the stub.
     */
1202
    if (virPCIDriverFile(&path, stubDriverName, "new_id") < 0) {
1203
        goto cleanup;
1204 1205
    }

1206
    if (virFileWriteStr(path, dev->id, 0) < 0) {
1207
        virReportSystemError(errno,
1208
                             _("Failed to add PCI device ID '%s' to %s"),
1209
                             dev->id, stubDriverName);
1210
        goto cleanup;
1211 1212
    }

1213
    /* check whether the device is bound to pci-stub when we write dev->id to
1214
     * ${stubDriver}/new_id.
1215
     */
1216
    if (virFileLinkPointsTo(driverLink, stubDriverPath)) {
1217 1218
        dev->unbind_from_stub = true;
        dev->remove_slot = true;
J
Jiri Denemark 已提交
1219
        result = 0;
1220 1221 1222
        goto remove_id;
    }

1223
    if (virPCIDeviceUnbind(dev, reprobe) < 0)
J
Jiri Denemark 已提交
1224
        goto remove_id;
1225

1226 1227
    /* If the device isn't already bound to pci-stub, try binding it now.
     */
1228
    if (!virFileLinkPointsTo(driverLink, stubDriverPath)) {
1229
        /* Xen's pciback.ko wants you to use new_slot first */
1230
        if (virPCIDriverFile(&path, stubDriverName, "new_slot") < 0) {
1231
            goto remove_id;
1232 1233
        }

1234
        if (virFileExists(path) && virFileWriteStr(path, dev->name, 0) < 0) {
1235
            virReportSystemError(errno,
1236 1237 1238
                                 _("Failed to add slot for "
                                   "PCI device '%s' to %s"),
                                 dev->name, stubDriverName);
1239
            goto remove_id;
1240
        }
1241
        dev->remove_slot = true;
1242

1243
        if (virPCIDriverFile(&path, stubDriverName, "bind") < 0) {
1244
            goto remove_id;
1245 1246
        }

1247
        if (virFileWriteStr(path, dev->name, 0) < 0) {
1248
            virReportSystemError(errno,
1249
                                 _("Failed to bind PCI device '%s' to %s"),
1250
                                 dev->name, stubDriverName);
1251
            goto remove_id;
1252
        }
1253
        dev->unbind_from_stub = true;
1254 1255
    }

J
Jiri Denemark 已提交
1256 1257
    result = 0;

1258
remove_id:
J
Jiri Denemark 已提交
1259 1260
    err = virSaveLastError();

1261 1262 1263
    /* If 'remove_id' exists, remove the device id from pci-stub's dynamic
     * ID table so that 'drivers_probe' works below.
     */
1264
    if (virPCIDriverFile(&path, stubDriverName, "remove_id") < 0) {
E
Eric Blake 已提交
1265
        /* We do not remove PCI ID from pci-stub, and we cannot reprobe it */
1266 1267
        if (dev->reprobe) {
            VIR_WARN("Could not remove PCI ID '%s' from %s, and the device "
1268
                     "cannot be probed again.", dev->id, stubDriverName);
1269
        }
1270
        dev->reprobe = false;
J
Jiri Denemark 已提交
1271
        result = -1;
1272 1273 1274
        goto cleanup;
    }

1275
    if (virFileExists(path) && virFileWriteStr(path, dev->id, 0) < 0) {
1276
        virReportSystemError(errno,
1277
                             _("Failed to remove PCI ID '%s' from %s"),
1278
                             dev->id, stubDriverName);
1279

E
Eric Blake 已提交
1280
        /* remove PCI ID from pci-stub failed, and we cannot reprobe it */
1281 1282
        if (dev->reprobe) {
            VIR_WARN("Failed to remove PCI ID '%s' from %s, and the device "
1283
                     "cannot be probed again.", dev->id, stubDriverName);
1284
        }
1285
        dev->reprobe = false;
J
Jiri Denemark 已提交
1286
        result = -1;
1287
        goto cleanup;
1288 1289
    }

1290
cleanup:
1291 1292
    VIR_FREE(stubDriverPath);
    VIR_FREE(driverLink);
1293 1294
    VIR_FREE(path);

J
Jiri Denemark 已提交
1295 1296 1297 1298
    if (result < 0) {
        VIR_FREE(newDriverName);
        virPCIDeviceUnbindFromStub(dev);
    } else {
1299
        VIR_FREE(dev->stubDriver);
J
Jiri Denemark 已提交
1300
        dev->stubDriver = newDriverName;
1301
    }
J
Jiri Denemark 已提交
1302 1303 1304 1305

    if (err)
        virSetError(err);
    virFreeError(err);
1306

1307
    return result;
1308 1309
}

1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327
/* virPCIDeviceDetach:
 *
 * Detach this device from the host driver, attach it to the stub
 * driver (previously set with virPCIDeviceSetStubDriver(), and add *a
 * copy* of the object to the inactiveDevs list (if provided). This
 * function will *never* consume dev, so the caller should free it.
 *
 * Returns 0 on success, -1 on failure (will fail if the device is
 * already in the activeDevs list, but will be a NOP if the device is
 * already bound to the stub).
 *
 * GENERAL NOTE: activeDevs should be a list of all PCI devices
 * currently in use by a domain. inactiveDevs is a list of all PCI
 * devices that libvirt has detached from the host driver + attached
 * to the stub driver, but hasn't yet assigned to a domain. Any device
 * that is still attached to its host driver should not be on either
 * list.
 */
1328
int
1329 1330
virPCIDeviceDetach(virPCIDevicePtr dev,
                   virPCIDeviceList *activeDevs,
1331
                   virPCIDeviceList *inactiveDevs)
1332
{
J
John Ferlan 已提交
1333 1334
    sa_assert(dev->stubDriver);

1335
    if (virPCIProbeStubDriver(dev->stubDriver) < 0)
1336 1337
        return -1;

1338
    if (activeDevs && virPCIDeviceListFind(activeDevs, dev)) {
1339
        virReportError(VIR_ERR_INTERNAL_ERROR,
1340 1341 1342 1343
                       _("Not detaching active device %s"), dev->name);
        return -1;
    }

1344
    if (virPCIDeviceBindToStub(dev, dev->stubDriver) < 0)
1345 1346
        return -1;

1347 1348 1349
    /* Add *a copy of* the dev into list inactiveDevs, if
     * it's not already there.
     */
1350 1351 1352
    if (inactiveDevs && !virPCIDeviceListFind(inactiveDevs, dev) &&
        virPCIDeviceListAddCopy(inactiveDevs, dev) < 0) {
        return -1;
1353 1354 1355
    }

    return 0;
1356 1357 1358
}

int
1359 1360
virPCIDeviceReattach(virPCIDevicePtr dev,
                     virPCIDeviceListPtr activeDevs,
1361
                     virPCIDeviceListPtr inactiveDevs)
1362
{
1363
    if (activeDevs && virPCIDeviceListFind(activeDevs, dev)) {
1364
        virReportError(VIR_ERR_INTERNAL_ERROR,
1365 1366 1367 1368
                       _("Not reattaching active device %s"), dev->name);
        return -1;
    }

1369
    if (virPCIDeviceUnbindFromStub(dev) < 0)
1370 1371 1372 1373
        return -1;

    /* Steal the dev from list inactiveDevs */
    if (inactiveDevs)
1374
        virPCIDeviceListDel(inactiveDevs, dev);
1375 1376

    return 0;
1377 1378
}

1379 1380 1381 1382 1383
/* Certain hypervisors (like qemu/kvm) map the PCI bar(s) on
 * the host when doing device passthrough.  This can lead to a race
 * condition where the hypervisor is still cleaning up the device while
 * libvirt is trying to re-attach it to the host device driver.  To avoid
 * this situation, we look through /proc/iomem, and if the hypervisor is
E
Eric Blake 已提交
1384 1385
 * still holding on to the bar (denoted by the string in the matcher
 * variable), then we can wait around a bit for that to clear up.
1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405
 *
 * A typical /proc/iomem looks like this (snipped for brevity):
 * 00010000-0008efff : System RAM
 * 0008f000-0008ffff : reserved
 * ...
 * 00100000-cc9fcfff : System RAM
 *   00200000-00483d3b : Kernel code
 *   00483d3c-005c88df : Kernel data
 * cc9fd000-ccc71fff : ACPI Non-volatile Storage
 * ...
 * d0200000-d02fffff : PCI Bus #05
 *   d0200000-d021ffff : 0000:05:00.0
 *     d0200000-d021ffff : e1000e
 *   d0220000-d023ffff : 0000:05:00.0
 *     d0220000-d023ffff : e1000e
 * ...
 * f0000000-f0003fff : 0000:00:1b.0
 *   f0000000-f0003fff : kvm_assigned_device
 *
 * Returns 0 if we are clear to continue, and 1 if the hypervisor is still
E
Eric Blake 已提交
1406
 * holding on to the resource.
1407 1408
 */
int
1409
virPCIDeviceWaitForCleanup(virPCIDevicePtr dev, const char *matcher)
1410 1411 1412
{
    FILE *fp;
    char line[160];
1413
    char *tmp;
1414
    unsigned long long start, end;
1415
    unsigned int domain, bus, slot, function;
1416
    bool in_matching_device;
1417 1418 1419 1420 1421 1422 1423 1424 1425
    int ret;
    size_t match_depth;

    fp = fopen("/proc/iomem", "r");
    if (!fp) {
        /* If we failed to open iomem, we just basically ignore the error.  The
         * unbind might succeed anyway, and besides, it's very likely we have
         * no way to report the error
         */
1426
        VIR_DEBUG("Failed to open /proc/iomem, trying to continue anyway");
1427 1428 1429 1430
        return 0;
    }

    ret = 0;
1431
    in_matching_device = false;
1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442
    match_depth = 0;
    while (fgets(line, sizeof(line), fp) != 0) {
        /* the logic here is a bit confusing.  For each line, we look to
         * see if it matches the domain:bus:slot.function we were given.
         * If this line matches the DBSF, then any subsequent lines indented
         * by 2 spaces are the PCI regions for this device.  It's also
         * possible that none of the PCI regions are currently mapped, in
         * which case we have no indented regions.  This code handles all
         * of these situations
         */
        if (in_matching_device && (strspn(line, " ") == (match_depth + 2))) {
1443 1444 1445 1446 1447 1448
            /* expected format: <start>-<end> : <suffix> */
            if (/* start */
                virStrToLong_ull(line, &tmp, 16, &start) < 0 || *tmp != '-' ||
                /* end */
                virStrToLong_ull(tmp + 1, &tmp, 16, &end) < 0 ||
                (tmp = STRSKIP(tmp, " : ")) == NULL)
1449 1450
                continue;

1451
            if (STRPREFIX(tmp, matcher)) {
1452 1453 1454 1455 1456
                ret = 1;
                break;
            }
        }
        else {
1457
            in_matching_device = false;
1458

1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472
            /* expected format: <start>-<end> : <domain>:<bus>:<slot>.<function> */
            if (/* start */
                virStrToLong_ull(line, &tmp, 16, &start) < 0 || *tmp != '-' ||
                /* end */
                virStrToLong_ull(tmp + 1, &tmp, 16, &end) < 0 ||
                (tmp = STRSKIP(tmp, " : ")) == NULL ||
                /* domain */
                virStrToLong_ui(tmp, &tmp, 16, &domain) < 0 || *tmp != ':' ||
                /* bus */
                virStrToLong_ui(tmp + 1, &tmp, 16, &bus) < 0 || *tmp != ':' ||
                /* slot */
                virStrToLong_ui(tmp + 1, &tmp, 16, &slot) < 0 || *tmp != '.' ||
                /* function */
                virStrToLong_ui(tmp + 1, &tmp, 16, &function) < 0 || *tmp != '\n')
1473 1474 1475 1476 1477
                continue;

            if (domain != dev->domain || bus != dev->bus || slot != dev->slot ||
                function != dev->function)
                continue;
1478
            in_matching_device = true;
1479 1480 1481 1482
            match_depth = strspn(line, " ");
        }
    }

E
Eric Blake 已提交
1483
    VIR_FORCE_FCLOSE(fp);
1484 1485 1486 1487

    return ret;
}

1488
static char *
1489
virPCIDeviceReadID(virPCIDevicePtr dev, const char *id_name)
1490
{
1491
    char *path = NULL;
1492 1493
    char *id_str;

1494
    if (virPCIFile(&path, dev->name, id_name) < 0) {
1495 1496
        return NULL;
    }
1497 1498

    /* ID string is '0xNNNN\n' ... i.e. 7 bytes */
1499 1500
    if (virFileReadAll(path, 7, &id_str) < 0) {
        VIR_FREE(path);
1501
        return NULL;
1502 1503 1504
    }

    VIR_FREE(path);
1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517

    /* Check for 0x suffix */
    if (id_str[0] != '0' || id_str[1] != 'x') {
        VIR_FREE(id_str);
        return NULL;
    }

    /* Chop off the newline; we know the string is 7 bytes */
    id_str[6] = '\0';

    return id_str;
}

1518
int
1519 1520 1521 1522
virPCIGetAddrString(unsigned int domain,
                    unsigned int bus,
                    unsigned int slot,
                    unsigned int function,
1523
                    char **pciConfigAddr)
1524
{
1525
    virPCIDevicePtr dev = NULL;
1526 1527
    int ret = -1;

1528
    dev = virPCIDeviceNew(domain, bus, slot, function);
1529
    if (dev != NULL) {
1530
        if (VIR_STRDUP(*pciConfigAddr, dev->name) < 0)
1531 1532 1533 1534 1535
            goto cleanup;
        ret = 0;
    }

cleanup:
1536
    virPCIDeviceFree(dev);
1537 1538 1539
    return ret;
}

1540
virPCIDevicePtr
1541 1542 1543 1544
virPCIDeviceNew(unsigned int domain,
                unsigned int bus,
                unsigned int slot,
                unsigned int function)
1545
{
1546
    virPCIDevicePtr dev;
E
Eric Blake 已提交
1547 1548
    char *vendor = NULL;
    char *product = NULL;
1549

1550
    if (VIR_ALLOC(dev) < 0)
1551 1552 1553 1554 1555 1556 1557
        return NULL;

    dev->domain   = domain;
    dev->bus      = bus;
    dev->slot     = slot;
    dev->function = function;

E
Eric Blake 已提交
1558 1559 1560
    if (snprintf(dev->name, sizeof(dev->name), "%.4x:%.2x:%.2x.%.1x",
                 dev->domain, dev->bus, dev->slot,
                 dev->function) >= sizeof(dev->name)) {
1561
        virReportError(VIR_ERR_INTERNAL_ERROR,
E
Eric Blake 已提交
1562 1563
                       _("dev->name buffer overflow: %.4x:%.2x:%.2x.%.1x"),
                       dev->domain, dev->bus, dev->slot, dev->function);
E
Eric Blake 已提交
1564
        goto error;
E
Eric Blake 已提交
1565 1566
    }
    if (virAsprintf(&dev->path, PCI_SYSFS "devices/%s/config",
1567
                    dev->name) < 0)
E
Eric Blake 已提交
1568
        goto error;
1569

1570
    if (!virFileExists(dev->path)) {
1571 1572 1573
        virReportSystemError(errno,
                             _("Device %s not found: could not access %s"),
                             dev->name, dev->path);
E
Eric Blake 已提交
1574
        goto error;
1575 1576
    }

1577 1578
    vendor  = virPCIDeviceReadID(dev, "vendor");
    product = virPCIDeviceReadID(dev, "device");
1579 1580

    if (!vendor || !product) {
1581
        virReportError(VIR_ERR_INTERNAL_ERROR,
1582 1583
                       _("Failed to read product/vendor ID for %s"),
                       dev->name);
E
Eric Blake 已提交
1584
        goto error;
1585 1586 1587
    }

    /* strings contain '0x' prefix */
E
Eric Blake 已提交
1588 1589
    if (snprintf(dev->id, sizeof(dev->id), "%s %s", &vendor[2],
                 &product[2]) >= sizeof(dev->id)) {
1590
        virReportError(VIR_ERR_INTERNAL_ERROR,
E
Eric Blake 已提交
1591 1592
                       _("dev->id buffer overflow: %s %s"),
                       &vendor[2], &product[2]);
E
Eric Blake 已提交
1593
        goto error;
E
Eric Blake 已提交
1594
    }
1595 1596 1597

    VIR_DEBUG("%s %s: initialized", dev->id, dev->name);

E
Eric Blake 已提交
1598 1599 1600
cleanup:
    VIR_FREE(product);
    VIR_FREE(vendor);
1601
    return dev;
E
Eric Blake 已提交
1602 1603

error:
1604
    virPCIDeviceFree(dev);
E
Eric Blake 已提交
1605 1606
    dev = NULL;
    goto cleanup;
1607 1608
}

L
Laine Stump 已提交
1609 1610 1611 1612 1613 1614

/* virPCIDeviceCopy:
 * @dev: device to duplicate
 *
 * Returns a newly allocated copy of @dev in which all the string
 * members (path, stubDriver, used_by_drvname, used_by_domname) are
 * duplicated as well, or NULL on failure.
 */
virPCIDevicePtr
virPCIDeviceCopy(virPCIDevicePtr dev)
{
    virPCIDevicePtr copy;

    if (VIR_ALLOC(copy) < 0)
        return NULL;

    /* shallow copy to take care of most attributes */
    *copy = *dev;

    /* The pointers just copied still belong to @dev; reset them
     * before duplicating the strings they point to.
     */
    copy->path = NULL;
    copy->stubDriver = NULL;
    copy->used_by_drvname = NULL;
    copy->used_by_domname = NULL;

    if (VIR_STRDUP(copy->path, dev->path) >= 0 &&
        VIR_STRDUP(copy->stubDriver, dev->stubDriver) >= 0 &&
        VIR_STRDUP(copy->used_by_drvname, dev->used_by_drvname) >= 0 &&
        VIR_STRDUP(copy->used_by_domname, dev->used_by_domname) >= 0)
        return copy;

    virPCIDeviceFree(copy);
    return NULL;
}


1636
void
1637
virPCIDeviceFree(virPCIDevicePtr dev)
1638
{
1639 1640
    if (!dev)
        return;
1641
    VIR_DEBUG("%s %s: freeing", dev->id, dev->name);
E
Eric Blake 已提交
1642
    VIR_FREE(dev->path);
1643
    VIR_FREE(dev->stubDriver);
C
Chunyan Liu 已提交
1644 1645
    VIR_FREE(dev->used_by_drvname);
    VIR_FREE(dev->used_by_domname);
1646 1647
    VIR_FREE(dev);
}
1648

1649
const char *
1650
virPCIDeviceGetName(virPCIDevicePtr dev)
1651 1652 1653 1654
{
    return dev->name;
}

1655
/* Store the @managed flag in @dev. */
void
virPCIDeviceSetManaged(virPCIDevicePtr dev, bool managed)
{
    dev->managed = managed;
}

1660 1661
/* Returns the managed flag stored in @dev. */
unsigned int
virPCIDeviceGetManaged(virPCIDevicePtr dev)
{
    return dev->managed;
}

1666
int
1667 1668
virPCIDeviceSetStubDriver(virPCIDevicePtr dev, const char *driver)
{
1669
    VIR_FREE(dev->stubDriver);
J
John Ferlan 已提交
1670
    return VIR_STRDUP(dev->stubDriver, driver);
1671 1672 1673 1674 1675 1676 1677 1678
}

/* Returns the stub driver name stored in @dev (NULL if none is set);
 * the string is owned by @dev and must not be freed by the caller.
 */
const char *
virPCIDeviceGetStubDriver(virPCIDevicePtr dev)
{
    return dev->stubDriver;
}

1679
unsigned int
1680
virPCIDeviceGetUnbindFromStub(virPCIDevicePtr dev)
1681 1682 1683 1684 1685
{
    return dev->unbind_from_stub;
}

void
1686
virPCIDeviceSetUnbindFromStub(virPCIDevicePtr dev, bool unbind)
1687
{
1688
    dev->unbind_from_stub = unbind;
1689 1690
}

1691
unsigned int
1692
virPCIDeviceGetRemoveSlot(virPCIDevicePtr dev)
1693 1694 1695 1696 1697
{
    return dev->remove_slot;
}

void
1698
virPCIDeviceSetRemoveSlot(virPCIDevicePtr dev, bool remove_slot)
1699
{
1700
    dev->remove_slot = remove_slot;
1701 1702
}

1703
unsigned int
1704
virPCIDeviceGetReprobe(virPCIDevicePtr dev)
1705 1706 1707 1708 1709
{
    return dev->reprobe;
}

void
1710
virPCIDeviceSetReprobe(virPCIDevicePtr dev, bool reprobe)
1711
{
1712
    dev->reprobe = reprobe;
1713 1714
}

C
Chunyan Liu 已提交
1715 1716 1717 1718
/* virPCIDeviceSetUsedBy:
 * @dev: device to update
 * @drv_name: name of the driver using the device (may be NULL)
 * @dom_name: name of the domain using the device (may be NULL)
 *
 * Replace the driver/domain names recorded in @dev with copies of the
 * given strings. Both copies are made before the old values are
 * released, so the arguments may be the strings currently stored in
 * @dev (e.g. as returned by virPCIDeviceGetUsedBy()), and @dev is left
 * unchanged if either copy cannot be allocated.
 *
 * Returns 0 on success, -1 on failure.
 */
int
virPCIDeviceSetUsedBy(virPCIDevicePtr dev,
                      const char *drv_name,
                      const char *dom_name)
{
    char *drvCopy = NULL;
    char *domCopy = NULL;

    if (VIR_STRDUP(drvCopy, drv_name) < 0 ||
        VIR_STRDUP(domCopy, dom_name) < 0) {
        VIR_FREE(drvCopy);
        VIR_FREE(domCopy);
        return -1;
    }

    VIR_FREE(dev->used_by_drvname);
    VIR_FREE(dev->used_by_domname);
    dev->used_by_drvname = drvCopy;
    dev->used_by_domname = domCopy;

    return 0;
}

C
Chunyan Liu 已提交
1730 1731 1732 1733
/* virPCIDeviceGetUsedBy:
 * @dev: device to query
 * @drv_name: filled with the recorded driver name
 * @dom_name: filled with the recorded domain name
 *
 * The returned strings are the ones stored in @dev (not copies), so
 * the caller must not free them.
 */
void
virPCIDeviceGetUsedBy(virPCIDevicePtr dev,
                      const char **drv_name,
                      const char **dom_name)
{
    *drv_name = dev->used_by_drvname;
    *dom_name = dev->used_by_domname;
}

1739
/* Turn on all three flags (unbind_from_stub, remove_slot, reprobe)
 * that virPCIDeviceUnbindFromStub() consults, so that none of its
 * steps is skipped for @pci.
 */
void
virPCIDeviceReattachInit(virPCIDevicePtr pci)
{
    pci->unbind_from_stub = true;
    pci->remove_slot = true;
    pci->reprobe = true;
}


1747 1748
virPCIDeviceListPtr
virPCIDeviceListNew(void)
1749
{
1750
    virPCIDeviceListPtr list;
1751

1752 1753 1754 1755
    if (virPCIInitialize() < 0)
        return NULL;

    if (!(list = virObjectLockableNew(virPCIDeviceListClass)))
1756 1757 1758 1759 1760
        return NULL;

    return list;
}

1761 1762
static void
virPCIDeviceListDispose(void *obj)
1763
{
1764
    virPCIDeviceListPtr list = obj;
1765
    size_t i;
1766 1767

    for (i = 0; i < list->count; i++) {
1768
        virPCIDeviceFree(list->devs[i]);
1769 1770 1771 1772 1773 1774 1775 1776
        list->devs[i] = NULL;
    }

    list->count = 0;
    VIR_FREE(list->devs);
}

int
1777 1778
virPCIDeviceListAdd(virPCIDeviceListPtr list,
                    virPCIDevicePtr dev)
1779
{
1780
    if (virPCIDeviceListFind(list, dev)) {
1781
        virReportError(VIR_ERR_INTERNAL_ERROR,
1782 1783 1784
                       _("Device %s is already in use"), dev->name);
        return -1;
    }
1785
    return VIR_APPEND_ELEMENT(list->devs, list->count, dev);
1786 1787
}

L
Laine Stump 已提交
1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804

/* virPCIDeviceListAddCopy - add a *copy* of the device to this list
 *
 * @dev itself is never stored in @list; the copy is freed again if it
 * cannot be added.
 *
 * Returns 0 on success, -1 on failure.
 */
int
virPCIDeviceListAddCopy(virPCIDeviceListPtr list, virPCIDevicePtr dev)
{
    virPCIDevicePtr dup;

    if (!(dup = virPCIDeviceCopy(dev)))
        return -1;

    if (virPCIDeviceListAdd(list, dup) >= 0)
        return 0;

    virPCIDeviceFree(dup);
    return -1;
}


1805 1806 1807
/* Returns the device stored at position @idx of @list (still owned by
 * the list), or NULL if @idx is out of range.
 */
virPCIDevicePtr
virPCIDeviceListGet(virPCIDeviceListPtr list,
                    int idx)
{
    if (idx < 0 || idx >= list->count)
        return NULL;

    return list->devs[idx];
}

1817
size_t
1818
virPCIDeviceListCount(virPCIDeviceListPtr list)
1819
{
1820 1821 1822
    return list->count;
}

1823 1824 1825
/* virPCIDeviceListStealIndex:
 * @list: list to remove from
 * @idx: position of the entry to remove
 *
 * Remove the entry at @idx from @list without freeing it.
 *
 * Returns the removed device (now owned by the caller), or NULL if
 * @idx is out of range.
 */
virPCIDevicePtr
virPCIDeviceListStealIndex(virPCIDeviceListPtr list,
                           int idx)
{
    virPCIDevicePtr stolen = NULL;

    if (idx >= 0 && idx < list->count) {
        stolen = list->devs[idx];
        VIR_DELETE_ELEMENT(list->devs, idx, list->count);
    }

    return stolen;
}

1837 1838 1839
/* Remove the entry of @list that has the same address as @dev and
 * return it without freeing it; returns NULL if there is no such entry.
 */
virPCIDevicePtr
virPCIDeviceListSteal(virPCIDeviceListPtr list,
                      virPCIDevicePtr dev)
{
    int idx = virPCIDeviceListFindIndex(list, dev);

    return virPCIDeviceListStealIndex(list, idx);
}

1844
void
1845 1846
virPCIDeviceListDel(virPCIDeviceListPtr list,
                    virPCIDevicePtr dev)
1847
{
1848
    virPCIDevicePtr ret = virPCIDeviceListSteal(list, dev);
1849
    virPCIDeviceFree(ret);
1850 1851
}

1852
int
1853
virPCIDeviceListFindIndex(virPCIDeviceListPtr list, virPCIDevicePtr dev)
1854
{
1855
    size_t i;
1856 1857 1858 1859 1860 1861

    for (i = 0; i < list->count; i++)
        if (list->devs[i]->domain   == dev->domain &&
            list->devs[i]->bus      == dev->bus    &&
            list->devs[i]->slot     == dev->slot   &&
            list->devs[i]->function == dev->function)
1862 1863 1864 1865
            return i;
    return -1;
}

L
Laine Stump 已提交
1866 1867 1868 1869 1870 1871 1872 1873

/* Returns the first entry in @list with the given PCI address (still
 * owned by the list), or NULL if there is none.
 */
virPCIDevicePtr
virPCIDeviceListFindByIDs(virPCIDeviceListPtr list,
                          unsigned int domain,
                          unsigned int bus,
                          unsigned int slot,
                          unsigned int function)
{
    size_t pos;

    for (pos = 0; pos < list->count; pos++) {
        virPCIDevicePtr cur = list->devs[pos];

        if (cur->domain != domain ||
            cur->bus != bus ||
            cur->slot != slot ||
            cur->function != function)
            continue;

        return cur;
    }

    return NULL;
}


1887 1888
/* Returns the entry of @list that has the same address as @dev (still
 * owned by the list), or NULL if there is none.
 */
virPCIDevicePtr
virPCIDeviceListFind(virPCIDeviceListPtr list, virPCIDevicePtr dev)
{
    int idx = virPCIDeviceListFindIndex(list, dev);

    return idx >= 0 ? list->devs[idx] : NULL;
}
1897 1898


1899 1900 1901
/* virPCIDeviceFileIterate:
 * @dev: PCI device whose sysfs directory is scanned
 * @actor: callback invoked as actor(dev, path, opaque) for each matching file
 * @opaque: handed to @actor unchanged
 *
 * Scan /sys/bus/pci/devices/<domain>:<bus>:<slot>.<function> and call
 * @actor on every entry needed for device assignment: "config", "rom",
 * "reset" and anything whose name starts with "resource".
 *
 * Returns 0 once every entry has been visited, or -1 if the directory
 * path could not be built, the directory could not be opened or read,
 * or @actor returned a negative value (iteration stops at that entry).
 */
int virPCIDeviceFileIterate(virPCIDevicePtr dev,
                            virPCIDeviceFileActor actor,
                            void *opaque)
{
    char *pcidir = NULL;
    char *file = NULL;
    DIR *dir = NULL;
    int ret = -1;
    struct dirent *ent;

    if (virAsprintf(&pcidir, "/sys/bus/pci/devices/%04x:%02x:%02x.%x",
                    dev->domain, dev->bus, dev->slot, dev->function) < 0)
        goto cleanup;

    if (!(dir = opendir(pcidir))) {
        virReportSystemError(errno,
                             _("cannot open %s"), pcidir);
        goto cleanup;
    }

    /* readdir() returns NULL both at end-of-directory and on failure;
     * clearing errno before every call is what lets the check after the
     * loop tell the two apart.
     */
    while ((errno = 0, ent = readdir(dir)) != NULL) {
        /* Device assignment requires:
         *   $PCIDIR/config, $PCIDIR/resource, $PCIDIR/resourceNNN,
         *   $PCIDIR/rom, $PCIDIR/reset
         */
        if (STREQ(ent->d_name, "config") ||
            STRPREFIX(ent->d_name, "resource") ||
            STREQ(ent->d_name, "rom") ||
            STREQ(ent->d_name, "reset")) {
            if (virAsprintf(&file, "%s/%s", pcidir, ent->d_name) < 0)
                goto cleanup;
            if ((actor)(dev, file, opaque) < 0)
                goto cleanup;

            VIR_FREE(file);
        }
    }
    if (errno != 0) {
        virReportSystemError(errno,
                             _("Failed to read directory entry for %s"),
                             pcidir);
        goto cleanup;
    }

    ret = 0;

cleanup:
    if (dir)
        closedir(dir);
    VIR_FREE(file);
    VIR_FREE(pcidir);
    return ret;
}
J
Jiri Denemark 已提交
1946

L
Laine Stump 已提交
1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964

/* virPCIDeviceAddressIOMMUGroupIterate:
 *   Invoke @actor on every device in the same iommu_group as orig
 *   (including orig itself). Directory entries whose name starts with
 *   '.' are ignored. If the group's "devices" directory cannot be
 *   opened (e.g. the device has no iommu_group), @actor is invoked once
 *   for orig and its return value is passed straight back.
 *
 *   Otherwise returns 0 after the whole directory has been processed,
 *   or -1 if an entry name is not a PCI address, @actor returns a
 *   negative value, or reading the directory fails.
 */
int
virPCIDeviceAddressIOMMUGroupIterate(virPCIDeviceAddressPtr orig,
                                     virPCIDeviceAddressActor actor,
                                     void *opaque)
{
    char *groupPath = NULL;
    DIR *groupDir = NULL;
    struct dirent *ent;
    int ret = -1;

    if (virAsprintf(&groupPath,
                    PCI_SYSFS "devices/%04x:%02x:%02x.%x/iommu_group/devices",
                    orig->domain, orig->bus, orig->slot, orig->function) < 0)
        goto cleanup;

    groupDir = opendir(groupPath);
    if (groupDir == NULL) {
        /* no group directory: the original device is all there is */
        ret = (actor)(orig, opaque);
        goto cleanup;
    }

    for (;;) {
        virPCIDeviceAddress member;

        /* reset errno so a NULL result can be classified after the loop */
        errno = 0;
        ent = readdir(groupDir);
        if (ent == NULL)
            break;

        if (ent->d_name[0] == '.')
            continue;

        if (virPCIDeviceAddressParse(ent->d_name, &member) < 0) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Found invalid device link '%s' in '%s'"),
                           ent->d_name, groupPath);
            goto cleanup;
        }

        if ((actor)(&member, opaque) < 0)
            goto cleanup;
    }

    if (errno != 0) {
        virReportSystemError(errno,
                             _("Failed to read directory entry for %s"),
                             groupPath);
        goto cleanup;
    }

    ret = 0;

cleanup:
    VIR_FREE(groupPath);
    if (groupDir)
        closedir(groupDir);
    return ret;
}


/* Iterator callback: build a virPCIDevice for @newDevAddr and add it to
 * the virPCIDeviceList passed in @opaque. Returns 0 on success, -1 if
 * the device could not be created or added (the device is freed in the
 * latter case).
 */
static int
virPCIDeviceGetIOMMUGroupAddOne(virPCIDeviceAddressPtr newDevAddr, void *opaque)
{
    virPCIDeviceListPtr groupList = opaque;
    virPCIDevicePtr created;

    created = virPCIDeviceNew(newDevAddr->domain, newDevAddr->bus,
                              newDevAddr->slot, newDevAddr->function);
    if (!created)
        return -1;

    if (virPCIDeviceListAdd(groupList, created) < 0) {
        /* still ours, the list did not take it */
        virPCIDeviceFree(created);
        return -1;
    }

    return 0;
}


/*
 * virPCIDeviceGetIOMMUGroupList - build a virPCIDeviceList holding
 * every device in the same iommu_group as @dev.
 *
 * Return the new list, or NULL if the list could not be created or
 * populated.
 */
virPCIDeviceListPtr
virPCIDeviceGetIOMMUGroupList(virPCIDevicePtr dev)
{
    virPCIDeviceListPtr groupList = virPCIDeviceListNew();
    virPCIDeviceAddress devAddr = { dev->domain, dev->bus,
                                    dev->slot, dev->function };

    if (groupList &&
        virPCIDeviceAddressIOMMUGroupIterate(&devAddr,
                                             virPCIDeviceGetIOMMUGroupAddOne,
                                             groupList) >= 0)
        return groupList;

    /* either the list was never created or filling it failed */
    virObjectUnref(groupList);
    return NULL;
}


/* Bundles the caller's output array and its element count so that both
 * can travel through the single opaque pointer of an iterator callback
 * (see virPCIGetIOMMUGroupAddressesAddOne).
 */
typedef struct {
    virPCIDeviceAddressPtr **iommuGroupDevices; /* where the caller's array of address pointers lives */
    size_t *nIommuGroupDevices;                 /* where the caller's element count lives */
} virPCIDeviceAddressList;
typedef virPCIDeviceAddressList *virPCIDeviceAddressListPtr;

/* Iterator callback: append a heap-allocated copy of @newDevAddr to the
 * array described by the virPCIDeviceAddressList in @opaque.
 * Returns 0 on success, -1 if allocating or appending failed.
 */
static int
virPCIGetIOMMUGroupAddressesAddOne(virPCIDeviceAddressPtr newDevAddr, void *opaque)
{
    virPCIDeviceAddressListPtr addrList = opaque;
    virPCIDeviceAddressPtr copyAddr;
    int rc = -1;

    /* the array stores pointers, so it needs its own copy */
    if (VIR_ALLOC(copyAddr) < 0)
        return rc;

    *copyAddr = *newDevAddr;

    if (VIR_APPEND_ELEMENT(*addrList->iommuGroupDevices,
                           *addrList->nIommuGroupDevices, copyAddr) >= 0)
        rc = 0;

    VIR_FREE(copyAddr);
    return rc;
}


/*
 * virPCIDeviceAddressGetIOMMUGroupAddresses - collect the addresses of
 * all devices in the same iommu_group as @devAddr.
 *
 * A separately allocated copy of each address is appended to
 * *@iommuGroupDevices, with *@nIommuGroupDevices tracking the element
 * count. Neither is reset here, so the caller must initialize both
 * before the call. Nothing is freed here either: the caller owns every
 * entry on the array, including those appended before a failure.
 *
 * Return 0 on success, -1 on failure
 */
int
virPCIDeviceAddressGetIOMMUGroupAddresses(virPCIDeviceAddressPtr devAddr,
                                          virPCIDeviceAddressPtr **iommuGroupDevices,
                                          size_t *nIommuGroupDevices)
{
    virPCIDeviceAddressList addrList = { iommuGroupDevices,
                                         nIommuGroupDevices };

    if (virPCIDeviceAddressIOMMUGroupIterate(devAddr,
                                             virPCIGetIOMMUGroupAddressesAddOne,
                                             &addrList) < 0)
        return -1;

    return 0;
}


/* virPCIDeviceAddressGetIOMMUGroupNum - look up which iommu_group the
 * PCI device at @addr belongs to.
 *
 * The group number is the last path component of the target of the
 * device's "iommu_group" symlink. Returns that number, -2 if the
 * device has no iommu_group symlink, or -1 on any other error.
 */
int
virPCIDeviceAddressGetIOMMUGroupNum(virPCIDeviceAddressPtr addr)
{
    char *bdfName = NULL;
    char *linkPath = NULL;
    char *resolved = NULL;
    const char *numStr;
    unsigned int num;
    int ret = -1;

    if (virAsprintf(&bdfName, "%.4x:%.2x:%.2x.%.1x", addr->domain,
                    addr->bus, addr->slot, addr->function) < 0 ||
        virPCIFile(&linkPath, bdfName, "iommu_group") < 0)
        goto cleanup;

    if (virFileIsLink(linkPath) != 1) {
        /* not an error as such: the device simply has no group */
        ret = -2;
        goto cleanup;
    }

    if (virFileResolveLink(linkPath, &resolved) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unable to resolve device %s iommu_group symlink %s"),
                       bdfName, linkPath);
        goto cleanup;
    }

    numStr = last_component(resolved);
    if (virStrToLong_ui(numStr, NULL, 10, &num) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("device %s iommu_group symlink %s has "
                         "invalid group number %s"),
                       bdfName, resolved, numStr);
        goto cleanup;
    }

    ret = num;

cleanup:
    VIR_FREE(bdfName);
    VIR_FREE(linkPath);
    VIR_FREE(resolved);
    return ret;
}


2165 2166
/* virPCIDeviceGetIOMMUGroupDev - return the name of the device used
 * to control this PCI device's group (e.g. "/dev/vfio/15")
2167 2168
 */
char *
2169
virPCIDeviceGetIOMMUGroupDev(virPCIDevicePtr dev)
2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189
{
    char *devPath = NULL;
    char *groupPath = NULL;
    char *groupDev = NULL;

    if (virPCIFile(&devPath, dev->name, "iommu_group") < 0)
        goto cleanup;
    if (virFileIsLink(devPath) != 1) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Invalid device %s iommu_group file %s is not a symlink"),
                       dev->name, devPath);
        goto cleanup;
    }
    if (virFileResolveLink(devPath, &groupPath) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unable to resolve device %s iommu_group symlink %s"),
                       dev->name, devPath);
        goto cleanup;
    }
    if (virAsprintf(&groupDev, "/dev/vfio/%s",
2190
                    last_component(groupPath)) < 0)
2191 2192 2193 2194 2195 2196 2197
        goto cleanup;
cleanup:
    VIR_FREE(devPath);
    VIR_FREE(groupPath);
    return groupDev;
}

J
Jiri Denemark 已提交
2198
static int
2199
virPCIDeviceDownstreamLacksACS(virPCIDevicePtr dev)
J
Jiri Denemark 已提交
2200 2201 2202 2203
{
    uint16_t flags;
    uint16_t ctrl;
    unsigned int pos;
2204 2205
    int fd;
    int ret = 0;
2206
    uint16_t device_class;
J
Jiri Denemark 已提交
2207

2208
    if ((fd = virPCIDeviceConfigOpen(dev, true)) < 0)
J
Jiri Denemark 已提交
2209 2210
        return -1;

2211
    if (virPCIDeviceInit(dev, fd) < 0) {
2212 2213 2214 2215
        ret = -1;
        goto cleanup;
    }

2216 2217 2218
    if (virPCIDeviceReadClass(dev, &device_class) < 0)
        goto cleanup;

J
Jiri Denemark 已提交
2219
    pos = dev->pcie_cap_pos;
2220
    if (!pos || device_class != PCI_CLASS_BRIDGE_PCI)
2221
        goto cleanup;
J
Jiri Denemark 已提交
2222

2223
    flags = virPCIDeviceRead16(dev, fd, pos + PCI_EXP_FLAGS);
J
Jiri Denemark 已提交
2224
    if (((flags & PCI_EXP_FLAGS_TYPE) >> 4) != PCI_EXP_TYPE_DOWNSTREAM)
2225
        goto cleanup;
J
Jiri Denemark 已提交
2226

2227
    pos = virPCIDeviceFindExtendedCapabilityOffset(dev, fd, PCI_EXT_CAP_ID_ACS);
J
Jiri Denemark 已提交
2228 2229
    if (!pos) {
        VIR_DEBUG("%s %s: downstream port lacks ACS", dev->id, dev->name);
2230 2231
        ret = 1;
        goto cleanup;
J
Jiri Denemark 已提交
2232 2233
    }

2234
    ctrl = virPCIDeviceRead16(dev, fd, pos + PCI_EXT_ACS_CTRL);
J
Jiri Denemark 已提交
2235 2236 2237
    if ((ctrl & PCI_EXT_CAP_ACS_ENABLED) != PCI_EXT_CAP_ACS_ENABLED) {
        VIR_DEBUG("%s %s: downstream port has ACS disabled",
                  dev->id, dev->name);
2238 2239
        ret = 1;
        goto cleanup;
J
Jiri Denemark 已提交
2240 2241
    }

2242
cleanup:
2243
    virPCIDeviceConfigClose(dev, fd);
2244
    return ret;
J
Jiri Denemark 已提交
2245 2246 2247
}

static int
2248
virPCIDeviceIsBehindSwitchLackingACS(virPCIDevicePtr dev)
J
Jiri Denemark 已提交
2249
{
2250
    virPCIDevicePtr parent;
J
Jiri Denemark 已提交
2251

2252
    if (virPCIDeviceGetParent(dev, &parent) < 0)
2253
        return -1;
2254 2255 2256 2257 2258 2259 2260 2261
    if (!parent) {
        /* if we have no parent, and this is the root bus, ACS doesn't come
         * into play since devices on the root bus can't P2P without going
         * through the root IOMMU.
         */
        if (dev->bus == 0)
            return 0;
        else {
2262
            virReportError(VIR_ERR_INTERNAL_ERROR,
2263 2264 2265 2266
                           _("Failed to find parent device for %s"),
                           dev->name);
            return -1;
        }
J
Jiri Denemark 已提交
2267 2268 2269 2270 2271 2272 2273
    }

    /* XXX we should rather fail when we can't find device's parent and
     * stop the loop when we get to root instead of just stopping when no
     * parent can be found
     */
    do {
2274
        virPCIDevicePtr tmp;
J
Jiri Denemark 已提交
2275
        int acs;
2276
        int ret;
J
Jiri Denemark 已提交
2277

2278
        acs = virPCIDeviceDownstreamLacksACS(parent);
J
Jiri Denemark 已提交
2279 2280

        if (acs) {
2281
            virPCIDeviceFree(parent);
J
Jiri Denemark 已提交
2282 2283 2284 2285 2286 2287 2288
            if (acs < 0)
                return -1;
            else
                return 1;
        }

        tmp = parent;
2289 2290
        ret = virPCIDeviceGetParent(parent, &parent);
        virPCIDeviceFree(tmp);
2291 2292
        if (ret < 0)
            return -1;
J
Jiri Denemark 已提交
2293 2294 2295 2296 2297
    } while (parent);

    return 0;
}

2298 2299
/* Decide whether @dev may be assigned to a guest.
 *
 * Returns 0 if the ACS check itself failed, or if the device is behind
 * a switch lacking ACS and @strict_acs_check is non-zero (an error is
 * reported in that case). Returns 1 otherwise.
 */
int virPCIDeviceIsAssignable(virPCIDevicePtr dev,
                             int strict_acs_check)
{
    int lacking;

    /* XXX This could be a great place to actually check that a non-managed
     * device isn't in use, e.g. by checking that device is either un-bound
     * or bound to a stub driver.
     */

    lacking = virPCIDeviceIsBehindSwitchLackingACS(dev);
    if (lacking < 0)
        return 0;

    if (lacking == 0)
        return 1;

    if (strict_acs_check) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Device %s is behind a switch lacking ACS and "
                         "cannot be assigned"),
                       dev->name);
        return 0;
    }

    VIR_DEBUG("%s %s: strict ACS check disabled; device assignment allowed",
              dev->id, dev->name);
    return 1;
}
2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345

/* Wrapper around virStrToLong_ui() that logs the outcome: an error
 * message when the conversion fails, a debug message with the value
 * when it succeeds. The return value of virStrToLong_ui() is passed
 * through unchanged.
 */
static int
logStrToLong_ui(char const *s,
                char **end_ptr,
                int base,
                unsigned int *result)
{
    int rc = virStrToLong_ui(s, end_ptr, base, result);

    if (rc == 0)
        VIR_DEBUG("Converted '%s' to unsigned int %u", s, *result);
    else
        VIR_ERROR(_("Failed to convert '%s' to unsigned int"), s);

    return rc;
}

2346 2347
int
virPCIDeviceAddressParse(char *address,
2348
                         virPCIDeviceAddressPtr bdf)
2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378
{
    char *p = NULL;
    int ret = -1;

    if ((address == NULL) || (logStrToLong_ui(address, &p, 16,
                                              &bdf->domain) == -1)) {
        goto out;
    }

    if ((p == NULL) || (logStrToLong_ui(p+1, &p, 16,
                                        &bdf->bus) == -1)) {
        goto out;
    }

    if ((p == NULL) || (logStrToLong_ui(p+1, &p, 16,
                                        &bdf->slot) == -1)) {
        goto out;
    }

    if ((p == NULL) || (logStrToLong_ui(p+1, &p, 16,
                                        &bdf->function) == -1)) {
        goto out;
    }

    ret = 0;

out:
    return ret;
}

2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393
#ifdef __linux__

/*
 * returns true if both addresses have the same domain, bus, slot and
 * function
 */
static bool
virPCIDeviceAddressIsEqual(virPCIDeviceAddressPtr bdf1,
                           virPCIDeviceAddressPtr bdf2)
{
    if (bdf1->domain != bdf2->domain)
        return false;
    if (bdf1->bus != bdf2->bus)
        return false;
    if (bdf1->slot != bdf2->slot)
        return false;

    return bdf1->function == bdf2->function;
}

2394
static int
2395 2396
virPCIGetDeviceAddressFromSysfsLink(const char *device_link,
                                    virPCIDeviceAddressPtr *bdf)
2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410
{
    char *config_address = NULL;
    char *device_path = NULL;
    char errbuf[64];
    int ret = -1;

    VIR_DEBUG("Attempting to resolve device path from device link '%s'",
              device_link);

    if (!virFileExists(device_link)) {
        VIR_DEBUG("sysfs_path '%s' does not exist", device_link);
        return ret;
    }

2411
    device_path = canonicalize_file_name(device_link);
2412 2413
    if (device_path == NULL) {
        memset(errbuf, '\0', sizeof(errbuf));
2414 2415 2416
        virReportSystemError(errno,
                             _("Failed to resolve device link '%s'"),
                             device_link);
2417 2418 2419
        return ret;
    }

2420
    config_address = last_component(device_path);
2421
    if (VIR_ALLOC(*bdf) != 0)
2422 2423
        goto out;

2424
    if (virPCIDeviceAddressParse(config_address, *bdf) != 0) {
2425
        virReportError(VIR_ERR_INTERNAL_ERROR,
2426 2427 2428 2429 2430 2431
                       _("Failed to parse PCI config address '%s'"),
                       config_address);
        VIR_FREE(*bdf);
        goto out;
    }

2432
    VIR_DEBUG("virPCIDeviceAddress %.4x:%.2x:%.2x.%.1x",
2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449
              (*bdf)->domain,
              (*bdf)->bus,
              (*bdf)->slot,
              (*bdf)->function);

    ret = 0;

out:
    VIR_FREE(device_path);

    return ret;
}

/*
 * Returns Physical function given a virtual function
 */
int
2450 2451
virPCIGetPhysicalFunction(const char *vf_sysfs_path,
                          virPCIDeviceAddressPtr *physical_function)
2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462
{
    int ret = -1;
    char *device_link = NULL;

    VIR_DEBUG("Attempting to get SR IOV physical function for device "
              "with sysfs path '%s'", vf_sysfs_path);

    if (virBuildPath(&device_link, vf_sysfs_path, "physfn") == -1) {
        virReportOOMError();
        return ret;
    } else {
2463 2464
        ret = virPCIGetDeviceAddressFromSysfsLink(device_link,
                                                  physical_function);
2465 2466 2467 2468 2469 2470 2471
    }

    VIR_FREE(device_link);

    return ret;
}

2472

2473 2474 2475 2476
/*
 * Returns virtual functions of a physical function
 */
int
2477 2478
virPCIGetVirtualFunctions(const char *sysfs_path,
                          virPCIDeviceAddressPtr **virtual_functions,
2479
                          size_t *num_virtual_functions)
2480 2481
{
    int ret = -1;
2482
    size_t i;
2483
    char *device_link = NULL;
2484
    virPCIDeviceAddress *config_addr = NULL;
2485 2486 2487 2488

    VIR_DEBUG("Attempting to get SR IOV virtual functions for device"
              "with sysfs path '%s'", sysfs_path);

2489 2490 2491
    *virtual_functions = NULL;
    *num_virtual_functions = 0;

2492 2493 2494 2495
    do {
        /* look for virtfn%d links until one isn't found */
        if (virAsprintf(&device_link, "%s/virtfn%zu", sysfs_path, *num_virtual_functions) < 0)
            goto error;
2496

2497 2498
        if (!virFileExists(device_link))
            break;
2499

2500 2501 2502 2503 2504 2505
        if (virPCIGetDeviceAddressFromSysfsLink(device_link, &config_addr) < 0) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Failed to get SRIOV function from device link '%s'"),
                           device_link);
            goto error;
        }
2506

2507 2508 2509 2510
        VIR_DEBUG("Found virtual function %zu", *num_virtual_functions);
        if (VIR_APPEND_ELEMENT(*virtual_functions, *num_virtual_functions, config_addr) < 0)
            goto error;
        VIR_FREE(device_link);
2511

2512
    } while (1);
2513 2514

    ret = 0;
2515 2516
cleanup:
    VIR_FREE(device_link);
2517
    VIR_FREE(config_addr);
2518
    return ret;
2519 2520

error:
2521 2522 2523
    for (i = 0; i < *num_virtual_functions; i++)
        VIR_FREE((*virtual_functions)[i]);
    VIR_FREE(*virtual_functions);
2524
    goto cleanup;
2525
}
2526

2527

2528 2529 2530 2531
/*
 * Returns 1 if vf device is a virtual function, 0 if not, -1 on error
 *
 * A device counts as a virtual function when a "physfn" entry exists
 * below @vf_sysfs_device_link.
 */
int
virPCIIsVirtualFunction(const char *vf_sysfs_device_link)
{
    char *physfn = NULL;
    int ret;

    if (virAsprintf(&physfn, "%s/physfn", vf_sysfs_device_link) < 0)
        return -1;

    ret = virFileExists(physfn);
    VIR_FREE(physfn);

    return ret;
}

/*
 * Returns the sriov virtual function index of vf given its pf
 */
int
2552 2553 2554
virPCIGetVirtualFunctionIndex(const char *pf_sysfs_device_link,
                              const char *vf_sysfs_device_link,
                              int *vf_index)
2555
{
2556 2557
    int ret = -1;
    size_t i;
2558
    size_t num_virt_fns = 0;
2559 2560
    virPCIDeviceAddressPtr vf_bdf = NULL;
    virPCIDeviceAddressPtr *virt_fns = NULL;
2561

2562 2563
    if (virPCIGetDeviceAddressFromSysfsLink(vf_sysfs_device_link,
                                            &vf_bdf) < 0)
2564 2565
        return ret;

2566 2567
    if (virPCIGetVirtualFunctions(pf_sysfs_device_link, &virt_fns,
                                  &num_virt_fns) < 0) {
2568
        virReportError(VIR_ERR_INTERNAL_ERROR,
2569
                       _("Error getting physical function's '%s' "
2570
                         "virtual_functions"), pf_sysfs_device_link);
2571 2572 2573 2574
        goto out;
    }

    for (i = 0; i < num_virt_fns; i++) {
2575 2576 2577 2578 2579
        if (virPCIDeviceAddressIsEqual(vf_bdf, virt_fns[i])) {
            *vf_index = i;
            ret = 0;
            break;
        }
2580 2581 2582 2583 2584 2585
    }

out:

    /* free virtual functions */
    for (i = 0; i < num_virt_fns; i++)
2586
        VIR_FREE(virt_fns[i]);
2587

A
ajia@redhat.com 已提交
2588
    VIR_FREE(virt_fns);
2589 2590 2591 2592 2593
    VIR_FREE(vf_bdf);

    return ret;
}

2594 2595 2596 2597 2598
/*
 * Returns a path to the PCI sysfs file given the BDF of the PCI function
 */

int
2599
virPCIGetSysfsFile(char *virPCIDeviceName, char **pci_sysfs_device_link)
2600
{
2601 2602 2603 2604
    if (virAsprintf(pci_sysfs_device_link, PCI_SYSFS "devices/%s",
                    virPCIDeviceName) < 0)
        return -1;
    return 0;
2605 2606
}

R
Roopa Prabhu 已提交
2607
int
2608 2609
virPCIDeviceAddressGetSysfsFile(virPCIDeviceAddressPtr dev,
                                char **pci_sysfs_device_link)
R
Roopa Prabhu 已提交
2610
{
2611 2612 2613 2614 2615
    if (virAsprintf(pci_sysfs_device_link,
                    PCI_SYSFS "devices/%04x:%02x:%02x.%x", dev->domain,
                    dev->bus, dev->slot, dev->function) < 0)
        return -1;
    return 0;
R
Roopa Prabhu 已提交
2616 2617
}

2618 2619 2620 2621
/*
 * Returns the network device name of a pci device
 *
 * The name is the first entry other than "." and ".." found in the
 * "net" directory below @device_link_sysfs_path; a copy is stored in
 * *@netname. Returns 0 on success, -1 if the directory cannot be
 * opened, holds no such entry, or the name cannot be copied.
 */
int
virPCIGetNetName(char *device_link_sysfs_path, char **netname)
{
    char *net_dir_path = NULL;
    DIR *net_dir;
    struct dirent *entry;
    int ret = -1;

    if (virBuildPath(&net_dir_path, device_link_sysfs_path,
                     "net") == -1) {
        virReportOOMError();
        return -1;
    }

    net_dir = opendir(net_dir_path);
    if (net_dir != NULL) {
        while ((entry = readdir(net_dir))) {
            if (STREQ(entry->d_name, ".") ||
                STREQ(entry->d_name, ".."))
                continue;

            /* Assume a single directory entry */
            if (VIR_STRDUP(*netname, entry->d_name) > 0)
                ret = 0;
            break;
        }

        closedir(net_dir);
    }

    VIR_FREE(net_dir_path);

    return ret;
}
R
Roopa Prabhu 已提交
2657 2658

int
2659 2660
virPCIGetVirtualFunctionInfo(const char *vf_sysfs_device_path,
                             char **pfname, int *vf_index)
R
Roopa Prabhu 已提交
2661
{
2662
    virPCIDeviceAddressPtr pf_config_address = NULL;
R
Roopa Prabhu 已提交
2663 2664 2665
    char *pf_sysfs_device_path = NULL;
    int ret = -1;

2666
    if (virPCIGetPhysicalFunction(vf_sysfs_device_path, &pf_config_address) < 0)
R
Roopa Prabhu 已提交
2667 2668
        return ret;

2669 2670
    if (virPCIDeviceAddressGetSysfsFile(pf_config_address,
                                        &pf_sysfs_device_path) < 0) {
R
Roopa Prabhu 已提交
2671 2672 2673 2674 2675

        VIR_FREE(pf_config_address);
        return ret;
    }

2676 2677
    if (virPCIGetVirtualFunctionIndex(pf_sysfs_device_path, vf_sysfs_device_path,
                                      vf_index) < 0)
R
Roopa Prabhu 已提交
2678 2679
        goto cleanup;

2680
    ret = virPCIGetNetName(pf_sysfs_device_path, pfname);
R
Roopa Prabhu 已提交
2681 2682 2683 2684 2685 2686 2687 2688

cleanup:
    VIR_FREE(pf_config_address);
    VIR_FREE(pf_sysfs_device_path);

    return ret;
}

2689
#else
2690 2691
/* Error text shared by the stubs in this #else branch; it is wrapped in
 * N_() here and passed through _() at each point of use. */
static const char *unsupported = N_("not supported on non-linux platforms");

2692
int
2693 2694
virPCIGetPhysicalFunction(const char *vf_sysfs_path ATTRIBUTE_UNUSED,
                          virPCIDeviceAddressPtr *physical_function ATTRIBUTE_UNUSED)
2695
{
2696
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2697 2698 2699 2700
    return -1;
}

int
2701 2702
virPCIGetVirtualFunctions(const char *sysfs_path ATTRIBUTE_UNUSED,
                          virPCIDeviceAddressPtr **virtual_functions ATTRIBUTE_UNUSED,
2703
                          size_t *num_virtual_functions ATTRIBUTE_UNUSED)
2704
{
2705
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2706 2707
    return -1;
}
2708 2709

int
E
Eric Blake 已提交
2710
virPCIIsVirtualFunction(const char *vf_sysfs_device_link ATTRIBUTE_UNUSED)
2711
{
2712
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2713 2714 2715 2716
    return -1;
}

int
2717 2718 2719
virPCIGetVirtualFunctionIndex(const char *pf_sysfs_device_link ATTRIBUTE_UNUSED,
                              const char *vf_sysfs_device_link ATTRIBUTE_UNUSED,
                              int *vf_index ATTRIBUTE_UNUSED)
2720
{
2721
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2722 2723 2724 2725
    return -1;

}

2726
int
2727 2728
virPCIDeviceAddressGetSysfsFile(virPCIDeviceAddressPtr dev ATTRIBUTE_UNUSED,
                                char **pci_sysfs_device_link ATTRIBUTE_UNUSED)
2729
{
2730
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2731 2732 2733
    return -1;
}

2734
int
2735
virPCIGetNetName(char *device_link_sysfs_path ATTRIBUTE_UNUSED,
2736
                 char **netname ATTRIBUTE_UNUSED)
2737
{
2738
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2739 2740
    return -1;
}
R
Roopa Prabhu 已提交
2741 2742

int
2743 2744 2745
virPCIGetVirtualFunctionInfo(const char *vf_sysfs_device_path ATTRIBUTE_UNUSED,
                             char **pfname ATTRIBUTE_UNUSED,
                             int *vf_index ATTRIBUTE_UNUSED)
R
Roopa Prabhu 已提交
2746
{
2747
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
R
Roopa Prabhu 已提交
2748 2749
    return -1;
}
2750
#endif /* __linux__ */