/*
 * virpci.c: helper APIs for managing host PCI devices
 *
 * Copyright (C) 2009-2013 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * Authors:
 *     Mark McLoughlin <markmc@redhat.com>
 */

#include <config.h>

26
#include "virpci.h"
27 28 29 30 31 32 33 34 35 36

#include <dirent.h>
#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
37
#include <stdlib.h>
38

39
#include "dirname.h"
40
#include "virlog.h"
41
#include "viralloc.h"
42
#include "vircommand.h"
43
#include "virerror.h"
E
Eric Blake 已提交
44
#include "virfile.h"
45
#include "virkmod.h"
46 47
#include "virstring.h"
#include "virutil.h"
48

49 50
VIR_LOG_INIT("util.pci");

51 52 53 54
#define PCI_SYSFS "/sys/bus/pci/"
#define PCI_ID_LEN 10   /* "XXXX XXXX" */
#define PCI_ADDR_LEN 13 /* "XXXX:XX:XX.X" */

55
struct _virPCIDevice {
56 57 58 59
    unsigned int  domain;
    unsigned int  bus;
    unsigned int  slot;
    unsigned int  function;
60 61 62

    char          name[PCI_ADDR_LEN]; /* domain:bus:slot.function */
    char          id[PCI_ID_LEN];     /* product vendor */
E
Eric Blake 已提交
63
    char          *path;
C
Chunyan Liu 已提交
64 65 66 67

    /* The driver:domain which uses the device */
    char          *used_by_drvname;
    char          *used_by_domname;
68

69 70
    unsigned int  pcie_cap_pos;
    unsigned int  pci_pm_cap_pos;
71 72
    bool          has_flr;
    bool          has_pm_reset;
73
    bool          managed;
74
    char          *stubDriver;
75 76

    /* used by reattach function */
77 78 79
    bool          unbind_from_stub;
    bool          remove_slot;
    bool          reprobe;
80 81
};

82
struct _virPCIDeviceList {
83 84
    virObjectLockable parent;

85
    size_t count;
86
    virPCIDevicePtr *devs;
87 88 89
};


90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
/* For virReportOOMError()  and virReportSystemError() */
#define VIR_FROM_THIS VIR_FROM_NONE

/* Specifications referenced in comments:
 *  PCI30  - PCI Local Bus Specification 3.0
 *  PCIe20 - PCI Express Base Specification 2.0
 *  BR12   - PCI-to-PCI Bridge Architecture Specification 1.2
 *  PM12   - PCI Bus Power Management Interface Specification 1.2
 *  ECN_AF - Advanced Capabilities for Conventional PCI ECN
 */

/* Type 0 config space header length; PCI30 Section 6.1 Configuration Space Organization */
#define PCI_CONF_LEN            0x100
#define PCI_CONF_HEADER_LEN     0x40

/* PCI30 6.2.1 */
#define PCI_HEADER_TYPE         0x0e    /* Header type */
107 108 109
#define PCI_HEADER_TYPE_BRIDGE 0x1
#define PCI_HEADER_TYPE_MASK   0x7f
#define PCI_HEADER_TYPE_MULTI  0x80
110 111 112 113 114 115 116 117 118

/* PCI30 6.2.1  Device Identification */
#define PCI_CLASS_DEVICE        0x0a    /* Device class */

/* Class Code for bridge; PCI30 D.7  Base Class 06h */
#define PCI_CLASS_BRIDGE_PCI    0x0604

/* PCI30 6.2.3  Device Status */
#define PCI_STATUS              0x06    /* 16 bits */
119
#define PCI_STATUS_CAP_LIST    0x10    /* Support Capability List */
120 121 122 123 124 125 126 127 128 129 130 131 132

/* PCI30 6.7  Capabilities List */
#define PCI_CAPABILITY_LIST     0x34    /* Offset of first capability list entry */

/* PM12 3.2.1  Capability Identifier */
#define PCI_CAP_ID_PM           0x01    /* Power Management */
/* PCI30 H Capability IDs */
#define PCI_CAP_ID_EXP          0x10    /* PCI Express */
/* ECN_AF 6.x.1.1  Capability ID for AF */
#define PCI_CAP_ID_AF           0x13    /* Advanced Features */

/* PCIe20 7.8.3  Device Capabilities Register (Offset 04h) */
#define PCI_EXP_DEVCAP          0x4     /* Device capabilities */
133
#define PCI_EXP_DEVCAP_FLR     (1<<28) /* Function Level Reset */
134 135 136 137 138 139 140

/* Header type 1 BR12 3.2 PCI-to-PCI Bridge Configuration Space Header Format */
#define PCI_PRIMARY_BUS         0x18    /* BR12 3.2.5.2 Primary bus number */
#define PCI_SECONDARY_BUS       0x19    /* BR12 3.2.5.3 Secondary bus number */
#define PCI_SUBORDINATE_BUS     0x1a    /* BR12 3.2.5.4 Highest bus number behind the bridge */
#define PCI_BRIDGE_CONTROL      0x3e
/* BR12 3.2.5.18  Bridge Control Register */
141
#define PCI_BRIDGE_CTL_RESET   0x40    /* Secondary bus reset */
142 143 144

/* PM12 3.2.4  Power Management Control/Status (Offset = 4) */
#define PCI_PM_CTRL                4    /* PM control and status register */
145 146 147 148
#define PCI_PM_CTRL_STATE_MASK    0x3  /* Current power state (D0 to D3) */
#define PCI_PM_CTRL_STATE_D0      0x0  /* D0 state */
#define PCI_PM_CTRL_STATE_D3hot   0x3  /* D3 state */
#define PCI_PM_CTRL_NO_SOFT_RESET 0x8  /* No reset for D3hot->D0 */
149 150 151

/* ECN_AF 6.x.1  Advanced Features Capability Structure */
#define PCI_AF_CAP              0x3     /* Advanced features capabilities */
152
#define PCI_AF_CAP_FLR         0x2     /* Function Level Reset */
153

J
Jiri Denemark 已提交
154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
#define PCI_EXP_FLAGS           0x2
#define PCI_EXP_FLAGS_TYPE      0x00f0
#define PCI_EXP_TYPE_DOWNSTREAM 0x6

#define PCI_EXT_CAP_BASE          0x100
#define PCI_EXT_CAP_LIMIT         0x1000
#define PCI_EXT_CAP_ID_MASK       0x0000ffff
#define PCI_EXT_CAP_OFFSET_SHIFT  20
#define PCI_EXT_CAP_OFFSET_MASK   0x00000ffc

#define PCI_EXT_CAP_ID_ACS      0x000d
#define PCI_EXT_ACS_CTRL        0x06

#define PCI_EXT_CAP_ACS_SV      0x01
#define PCI_EXT_CAP_ACS_RR      0x04
#define PCI_EXT_CAP_ACS_CR      0x08
#define PCI_EXT_CAP_ACS_UF      0x10
171 172 173
#define PCI_EXT_CAP_ACS_ENABLED (PCI_EXT_CAP_ACS_SV |   \
                                 PCI_EXT_CAP_ACS_RR |   \
                                 PCI_EXT_CAP_ACS_CR |   \
J
Jiri Denemark 已提交
174 175
                                 PCI_EXT_CAP_ACS_UF)

176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192
static virClassPtr virPCIDeviceListClass;

static void virPCIDeviceListDispose(void *obj);

/* One-time initializer: creates the virPCIDeviceList class, derived
 * from the lockable object class, with virPCIDeviceListDispose as its
 * dispose callback.
 *
 * Returns 0 on success, -1 if the class could not be created.
 */
static int virPCIOnceInit(void)
{
    virPCIDeviceListClass = virClassNew(virClassForObjectLockable(),
                                        "virPCIDeviceList",
                                        sizeof(virPCIDeviceList),
                                        virPCIDeviceListDispose);
    if (virPCIDeviceListClass == NULL)
        return -1;

    return 0;
}

VIR_ONCE_GLOBAL_INIT(virPCI)

L
Laine Stump 已提交
193 194 195 196 197 198

static int
virPCIDriverDir(char **buffer, const char *driver)
{
    VIR_FREE(*buffer);

199 200 201
    if (virAsprintf(buffer, PCI_SYSFS "drivers/%s", driver) < 0)
        return -1;
    return 0;
L
Laine Stump 已提交
202 203 204 205 206 207 208 209
}


static int
virPCIDriverFile(char **buffer, const char *driver, const char *file)
{
    VIR_FREE(*buffer);

210 211 212
    if (virAsprintf(buffer, PCI_SYSFS "drivers/%s/%s", driver, file) < 0)
        return -1;
    return 0;
L
Laine Stump 已提交
213 214 215 216 217 218 219 220
}


static int
virPCIFile(char **buffer, const char *device, const char *file)
{
    VIR_FREE(*buffer);

221 222 223
    if (virAsprintf(buffer, PCI_SYSFS "devices/%s/%s", device, file) < 0)
        return -1;
    return 0;
L
Laine Stump 已提交
224 225 226 227 228 229 230 231 232 233
}


/* virPCIDeviceGetDriverPathAndName - put the path to the driver
 * directory of the driver in use for this device in @path and the
 * name of the driver in @name. Both could be NULL if it's not bound
 * to any driver.
 *
 * Return 0 for success, -1 for error.
 */
234
int
L
Laine Stump 已提交
235 236 237 238 239 240 241 242 243 244
virPCIDeviceGetDriverPathAndName(virPCIDevicePtr dev, char **path, char **name)
{
    int ret = -1;
    char *drvlink = NULL;

    *path = *name = NULL;
    /* drvlink = "/sys/bus/pci/dddd:bb:ss.ff/driver" */
    if (virPCIFile(&drvlink, dev->name, "driver") < 0)
        goto cleanup;

245 246 247 248 249
    if (!virFileExists(drvlink)) {
        ret = 0;
        goto cleanup;
    }

L
Laine Stump 已提交
250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268
    if (virFileIsLink(drvlink) != 1) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Invalid device %s driver file %s is not a symlink"),
                       dev->name, drvlink);
        goto cleanup;
    }
    if (virFileResolveLink(drvlink, path) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unable to resolve device %s driver symlink %s"),
                       dev->name, drvlink);
        goto cleanup;
    }
    /* path = "/sys/bus/pci/drivers/${drivername}" */

    if (VIR_STRDUP(*name, last_component(*path)) < 0)
        goto cleanup;
    /* name = "${drivername}" */

    ret = 0;
269
 cleanup:
L
Laine Stump 已提交
270 271 272 273 274 275 276 277 278
    VIR_FREE(drvlink);
    if (ret < 0) {
        VIR_FREE(*path);
        VIR_FREE(*name);
    }
    return ret;
}


279
static int
280
virPCIDeviceConfigOpen(virPCIDevicePtr dev, bool fatal)
281 282 283 284
{
    int fd;

    fd = open(dev->path, O_RDWR);
285

286
    if (fd < 0) {
287 288 289 290 291 292 293 294 295
        if (fatal) {
            virReportSystemError(errno,
                                 _("Failed to open config space file '%s'"),
                                 dev->path);
        } else {
            char ebuf[1024];
            VIR_WARN("Failed to open config space file '%s': %s",
                     dev->path, virStrerror(errno, ebuf, sizeof(ebuf)));
        }
296 297
        return -1;
    }
298

299
    VIR_DEBUG("%s %s: opened %s", dev->id, dev->name, dev->path);
300
    return fd;
301 302
}

303
static void
304
virPCIDeviceConfigClose(virPCIDevicePtr dev, int cfgfd)
305
{
306 307 308 309 310
    if (VIR_CLOSE(cfgfd) < 0) {
        char ebuf[1024];
        VIR_WARN("Failed to close config space file '%s': %s",
                 dev->path, virStrerror(errno, ebuf, sizeof(ebuf)));
    }
311 312
}

313

314
static int
315 316
virPCIDeviceRead(virPCIDevicePtr dev,
                 int cfgfd,
317
                 unsigned int pos,
318
                 uint8_t *buf,
319
                 unsigned int buflen)
320 321 322
{
    memset(buf, 0, buflen);

323 324
    if (lseek(cfgfd, pos, SEEK_SET) != pos ||
        saferead(cfgfd, buf, buflen) != buflen) {
325
        char ebuf[1024];
326
        VIR_WARN("Failed to read from '%s' : %s", dev->path,
327 328 329 330 331 332 333
                 virStrerror(errno, ebuf, sizeof(ebuf)));
        return -1;
    }
    return 0;
}

static uint8_t
334
virPCIDeviceRead8(virPCIDevicePtr dev, int cfgfd, unsigned int pos)
335 336
{
    uint8_t buf;
337
    virPCIDeviceRead(dev, cfgfd, pos, &buf, sizeof(buf));
338 339 340 341
    return buf;
}

static uint16_t
342
virPCIDeviceRead16(virPCIDevicePtr dev, int cfgfd, unsigned int pos)
343 344
{
    uint8_t buf[2];
345
    virPCIDeviceRead(dev, cfgfd, pos, &buf[0], sizeof(buf));
346 347 348 349
    return (buf[0] << 0) | (buf[1] << 8);
}

static uint32_t
350
virPCIDeviceRead32(virPCIDevicePtr dev, int cfgfd, unsigned int pos)
351 352
{
    uint8_t buf[4];
353
    virPCIDeviceRead(dev, cfgfd, pos, &buf[0], sizeof(buf));
354 355 356
    return (buf[0] << 0) | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24);
}

357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381
/* Read the sysfs "class" attribute of @dev and store bits 8..23 of
 * its value in *@device_class (the low byte, presumably the
 * programming interface, is dropped).
 *
 * Returns 0 on success, -1 on failure; an unparsable value is
 * reported as an error.
 */
static int
virPCIDeviceReadClass(virPCIDevicePtr dev, uint16_t *device_class)
{
    char *path = NULL;
    char *id_str = NULL;
    int ret = -1;
    int len;
    unsigned int value;

    if (virPCIFile(&path, dev->name, "class") < 0)
        return ret;

    /* class string is '0xNNNNNN\n' ... i.e. 9 bytes */
    if ((len = virFileReadAll(path, 9, &id_str)) < 0)
        goto cleanup;

    /* Cut off the trailing newline, but only if that many bytes were
     * actually read; otherwise index 8 may lie outside the buffer.
     */
    if (len > 8)
        id_str[8] = '\0';

    if (virStrToLong_ui(id_str, NULL, 16, &value) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unusual value in %s/devices/%s/class: %s"),
                       PCI_SYSFS, dev->name, id_str);
        goto cleanup;
    }

    *device_class = (value >> 8) & 0xFFFF;
    ret = 0;

 cleanup:
    VIR_FREE(id_str);
    VIR_FREE(path);
    return ret;
}

388
static int
389 390
virPCIDeviceWrite(virPCIDevicePtr dev,
                  int cfgfd,
391
                  unsigned int pos,
392
                  uint8_t *buf,
393
                  unsigned int buflen)
394
{
395 396
    if (lseek(cfgfd, pos, SEEK_SET) != pos ||
        safewrite(cfgfd, buf, buflen) != buflen) {
397
        char ebuf[1024];
398
        VIR_WARN("Failed to write to '%s' : %s", dev->path,
399 400 401 402 403 404 405
                 virStrerror(errno, ebuf, sizeof(ebuf)));
        return -1;
    }
    return 0;
}

static void
406
virPCIDeviceWrite16(virPCIDevicePtr dev, int cfgfd, unsigned int pos, uint16_t val)
407 408
{
    uint8_t buf[2] = { (val >> 0), (val >> 8) };
409
    virPCIDeviceWrite(dev, cfgfd, pos, &buf[0], sizeof(buf));
410 411 412
}

static void
413
virPCIDeviceWrite32(virPCIDevicePtr dev, int cfgfd, unsigned int pos, uint32_t val)
414
{
415
    uint8_t buf[4] = { (val >> 0), (val >> 8), (val >> 16), (val >> 24) };
416
    virPCIDeviceWrite(dev, cfgfd, pos, &buf[0], sizeof(buf));
417 418
}

E
Eric Blake 已提交
419 420
typedef int (*virPCIDeviceIterPredicate)(virPCIDevicePtr, virPCIDevicePtr,
                                         void *);
421 422 423 424 425 426 427

/* Iterate over available PCI devices calling @predicate
 * to compare each one to @dev.
 * Return -1 on error since we don't want to assume it is
 * safe to reset if there is an error.
 */
static int
428 429 430 431
virPCIDeviceIterDevices(virPCIDeviceIterPredicate predicate,
                        virPCIDevicePtr dev,
                        virPCIDevicePtr *matched,
                        void *data)
432 433 434
{
    DIR *dir;
    struct dirent *entry;
435
    int ret = 0;
436
    int rc;
437 438 439 440 441 442 443

    *matched = NULL;

    VIR_DEBUG("%s %s: iterating over " PCI_SYSFS "devices", dev->id, dev->name);

    dir = opendir(PCI_SYSFS "devices");
    if (!dir) {
444
        VIR_WARN("Failed to open " PCI_SYSFS "devices");
445 446 447 448
        return -1;
    }

    while ((entry = readdir(dir))) {
449
        unsigned int domain, bus, slot, function;
450
        virPCIDevicePtr check;
451
        char *tmp;
452 453 454 455 456

        /* Ignore '.' and '..' */
        if (entry->d_name[0] == '.')
            continue;

457 458 459 460 461 462 463 464 465
        /* expected format: <domain>:<bus>:<slot>.<function> */
        if (/* domain */
            virStrToLong_ui(entry->d_name, &tmp, 16, &domain) < 0 || *tmp != ':' ||
            /* bus */
            virStrToLong_ui(tmp + 1, &tmp, 16, &bus) < 0 || *tmp != ':' ||
            /* slot */
            virStrToLong_ui(tmp + 1, &tmp, 16, &slot) < 0 || *tmp != '.' ||
            /* function */
            virStrToLong_ui(tmp + 1, NULL, 16, &function) < 0) {
466 467 468 469
            VIR_WARN("Unusual entry in " PCI_SYSFS "devices: %s", entry->d_name);
            continue;
        }

470
        check = virPCIDeviceNew(domain, bus, slot, function);
471
        if (!check) {
472 473 474
            ret = -1;
            break;
        }
475

476 477 478
        rc = predicate(dev, check, data);
        if (rc < 0) {
            /* the predicate returned an error, bail */
479
            virPCIDeviceFree(check);
480 481 482 483
            ret = -1;
            break;
        }
        else if (rc == 1) {
484 485
            VIR_DEBUG("%s %s: iter matched on %s", dev->id, dev->name, check->name);
            *matched = check;
486
            ret = 1;
487 488
            break;
        }
489

490
        virPCIDeviceFree(check);
491 492
    }
    closedir(dir);
493
    return ret;
494 495 496
}

static uint8_t
497 498 499
virPCIDeviceFindCapabilityOffset(virPCIDevicePtr dev,
                                 int cfgfd,
                                 unsigned int capability)
500 501 502 503
{
    uint16_t status;
    uint8_t pos;

504
    status = virPCIDeviceRead16(dev, cfgfd, PCI_STATUS);
505 506 507
    if (!(status & PCI_STATUS_CAP_LIST))
        return 0;

508
    pos = virPCIDeviceRead8(dev, cfgfd, PCI_CAPABILITY_LIST);
509 510 511 512 513 514 515 516 517

    /* Zero indicates last capability, capabilities can't
     * be in the config space header and 0xff is returned
     * by the kernel if we don't have access to this region
     *
     * Note: we're not handling loops or extended
     * capabilities here.
     */
    while (pos >= PCI_CONF_HEADER_LEN && pos != 0xff) {
518
        uint8_t capid = virPCIDeviceRead8(dev, cfgfd, pos);
519 520 521 522 523 524
        if (capid == capability) {
            VIR_DEBUG("%s %s: found cap 0x%.2x at 0x%.2x",
                      dev->id, dev->name, capability, pos);
            return pos;
        }

525
        pos = virPCIDeviceRead8(dev, cfgfd, pos + 1);
526 527 528 529 530 531 532
    }

    VIR_DEBUG("%s %s: failed to find cap 0x%.2x", dev->id, dev->name, capability);

    return 0;
}

J
Jiri Denemark 已提交
533
static unsigned int
534 535
virPCIDeviceFindExtendedCapabilityOffset(virPCIDevicePtr dev,
                                         int cfgfd,
536
                                         unsigned int capability)
J
Jiri Denemark 已提交
537 538 539 540 541 542 543 544 545 546
{
    int ttl;
    unsigned int pos;
    uint32_t header;

    /* minimum 8 bytes per capability */
    ttl = (PCI_EXT_CAP_LIMIT - PCI_EXT_CAP_BASE) / 8;
    pos = PCI_EXT_CAP_BASE;

    while (ttl > 0 && pos >= PCI_EXT_CAP_BASE) {
547
        header = virPCIDeviceRead32(dev, cfgfd, pos);
J
Jiri Denemark 已提交
548 549 550 551 552 553 554 555 556 557 558

        if ((header & PCI_EXT_CAP_ID_MASK) == capability)
            return pos;

        pos = (header >> PCI_EXT_CAP_OFFSET_SHIFT) & PCI_EXT_CAP_OFFSET_MASK;
        ttl--;
    }

    return 0;
}

559 560 561 562
/* detects whether this device has FLR.  Returns 0 if the device does
 * not have FLR, 1 if it does, and -1 on error
 */
static int
563
virPCIDeviceDetectFunctionLevelReset(virPCIDevicePtr dev, int cfgfd)
564
{
M
Mark McLoughlin 已提交
565
    uint32_t caps;
566
    uint8_t pos;
567 568
    char *path;
    int found;
569 570 571 572 573 574 575 576

    /* The PCIe Function Level Reset capability allows
     * individual device functions to be reset without
     * affecting any other functions on the device or
     * any other devices on the bus. This is only common
     * on SR-IOV NICs at the moment.
     */
    if (dev->pcie_cap_pos) {
577
        caps = virPCIDeviceRead32(dev, cfgfd, dev->pcie_cap_pos + PCI_EXP_DEVCAP);
578 579 580 581 582 583 584 585 586 587
        if (caps & PCI_EXP_DEVCAP_FLR) {
            VIR_DEBUG("%s %s: detected PCIe FLR capability", dev->id, dev->name);
            return 1;
        }
    }

    /* The PCI AF Function Level Reset capability is
     * the same thing, except for conventional PCI
     * devices. This is not common yet.
     */
588
    pos = virPCIDeviceFindCapabilityOffset(dev, cfgfd, PCI_CAP_ID_AF);
589
    if (pos) {
590
        caps = virPCIDeviceRead16(dev, cfgfd, pos + PCI_AF_CAP);
591 592 593 594 595 596
        if (caps & PCI_AF_CAP_FLR) {
            VIR_DEBUG("%s %s: detected PCI FLR capability", dev->id, dev->name);
            return 1;
        }
    }

597 598 599 600 601 602
    /* there are some buggy devices that do support FLR, but forget to
     * advertise that fact in their capabilities.  However, FLR is *required*
     * to be present for virtual functions (VFs), so if we see that this
     * device is a VF, we just assume FLR works
     */

603
    if (virAsprintf(&path, PCI_SYSFS "devices/%s/physfn", dev->name) < 0)
604 605 606 607 608 609 610 611 612 613
        return -1;

    found = virFileExists(path);
    VIR_FREE(path);
    if (found) {
        VIR_DEBUG("%s %s: buggy device didn't advertise FLR, but is a VF; forcing flr on",
                  dev->id, dev->name);
        return 1;
    }

614 615 616 617 618 619 620 621 622
    VIR_DEBUG("%s %s: no FLR capability found", dev->id, dev->name);

    return 0;
}

/* Require the device has the PCI Power Management capability
 * and that a D3hot->D0 transition will results in a full
 * internal reset, not just a soft reset.
 */
623
static unsigned int
624
virPCIDeviceDetectPowerManagementReset(virPCIDevicePtr dev, int cfgfd)
625 626 627 628 629
{
    if (dev->pci_pm_cap_pos) {
        uint32_t ctl;

        /* require the NO_SOFT_RESET bit is clear */
630
        ctl = virPCIDeviceRead32(dev, cfgfd, dev->pci_pm_cap_pos + PCI_PM_CTRL);
631 632 633 634 635 636 637 638 639 640 641
        if (!(ctl & PCI_PM_CTRL_NO_SOFT_RESET)) {
            VIR_DEBUG("%s %s: detected PM reset capability", dev->id, dev->name);
            return 1;
        }
    }

    VIR_DEBUG("%s %s: no PM reset capability found", dev->id, dev->name);

    return 0;
}

642
/* Any active devices on the same domain/bus ? */
643
static int
644
virPCIDeviceSharesBusWithActive(virPCIDevicePtr dev, virPCIDevicePtr check, void *data)
645
{
646
    virPCIDeviceList *inactiveDevs = data;
647

648
    /* Different domain, different bus, or simply identical device */
649 650
    if (dev->domain != check->domain ||
        dev->bus != check->bus ||
651 652
        (dev->slot == check->slot &&
         dev->function == check->function))
653 654
        return 0;

655
    /* same bus, but inactive, i.e. about to be assigned to guest */
656
    if (inactiveDevs && virPCIDeviceListFind(inactiveDevs, check))
657
        return 0;
658

659
    return 1;
660 661
}

662 663 664
/* Look for a device that shares the bus with @dev and is not listed
 * in @inactiveDevs.
 *
 * Returns the first such device found (not freed here), or NULL if
 * there is none or the iteration failed.
 */
static virPCIDevicePtr
virPCIDeviceBusContainsActiveDevices(virPCIDevicePtr dev,
                                     virPCIDeviceList *inactiveDevs)
{
    virPCIDevicePtr active = NULL;
    int rc;

    rc = virPCIDeviceIterDevices(virPCIDeviceSharesBusWithActive,
                                 dev, &active, inactiveDevs);

    return rc < 0 ? NULL : active;
}

/* Is @check the parent of @dev ? */
674
static int
675
virPCIDeviceIsParent(virPCIDevicePtr dev, virPCIDevicePtr check, void *data)
676 677 678
{
    uint16_t device_class;
    uint8_t header_type, secondary, subordinate;
679
    virPCIDevicePtr *best = data;
680 681
    int ret = 0;
    int fd;
682

683
    if (dev->domain != check->domain)
684 685
        return 0;

686
    if ((fd = virPCIDeviceConfigOpen(check, false)) < 0)
687 688
        return 0;

689
    /* Is it a bridge? */
690 691
    ret = virPCIDeviceReadClass(check, &device_class);
    if (ret < 0 || device_class != PCI_CLASS_BRIDGE_PCI)
692
        goto cleanup;
693 694

    /* Is it a plane? */
695
    header_type = virPCIDeviceRead8(check, fd, PCI_HEADER_TYPE);
696
    if ((header_type & PCI_HEADER_TYPE_MASK) != PCI_HEADER_TYPE_BRIDGE)
697
        goto cleanup;
698

699 700
    secondary   = virPCIDeviceRead8(check, fd, PCI_SECONDARY_BUS);
    subordinate = virPCIDeviceRead8(check, fd, PCI_SUBORDINATE_BUS);
701

702
    VIR_DEBUG("%s %s: found parent device %s", dev->id, dev->name, check->name);
703

704 705 706
    /* if the secondary bus exactly equals the device's bus, then we found
     * the direct parent.  No further work is necessary
     */
707 708 709 710
    if (dev->bus == secondary) {
        ret = 1;
        goto cleanup;
    }
711

712
    /* otherwise, SRIOV allows VFs to be on different buses than their PFs.
713 714 715 716 717
     * In this case, what we need to do is look for the "best" match; i.e.
     * the most restrictive match that still satisfies all of the conditions.
     */
    if (dev->bus > secondary && dev->bus <= subordinate) {
        if (*best == NULL) {
718 719
            *best = virPCIDeviceNew(check->domain, check->bus, check->slot,
                                    check->function);
720 721 722 723 724
            if (*best == NULL) {
                ret = -1;
                goto cleanup;
            }
        } else {
725 726 727 728
            /* OK, we had already recorded a previous "best" match for the
             * parent.  See if the current device is more restrictive than the
             * best, and if so, make it the new best
             */
729 730 731
            int bestfd;
            uint8_t best_secondary;

732
            if ((bestfd = virPCIDeviceConfigOpen(*best, false)) < 0)
733
                goto cleanup;
734 735
            best_secondary = virPCIDeviceRead8(*best, bestfd, PCI_SECONDARY_BUS);
            virPCIDeviceConfigClose(*best, bestfd);
736 737

            if (secondary > best_secondary) {
738 739 740
                virPCIDeviceFree(*best);
                *best = virPCIDeviceNew(check->domain, check->bus, check->slot,
                                        check->function);
741 742 743 744
                if (*best == NULL) {
                    ret = -1;
                    goto cleanup;
                }
745 746 747 748
            }
        }
    }

749
 cleanup:
750
    virPCIDeviceConfigClose(check, fd);
751
    return ret;
752 753
}

754
static int
755
virPCIDeviceGetParent(virPCIDevicePtr dev, virPCIDevicePtr *parent)
756
{
757
    virPCIDevicePtr best = NULL;
758 759 760
    int ret;

    *parent = NULL;
761
    ret = virPCIDeviceIterDevices(virPCIDeviceIsParent, dev, parent, &best);
762
    if (ret == 1)
763
        virPCIDeviceFree(best);
764 765 766
    else if (ret == 0)
        *parent = best;
    return ret;
767 768 769 770 771 772
}

/* Secondary Bus Reset is our sledgehammer - it resets all
 * devices behind a bus.
 */
static int
773 774 775
virPCIDeviceTrySecondaryBusReset(virPCIDevicePtr dev,
                                 int cfgfd,
                                 virPCIDeviceList *inactiveDevs)
776
{
777
    virPCIDevicePtr parent, conflict;
778 779 780
    uint8_t config_space[PCI_CONF_LEN];
    uint16_t ctl;
    int ret = -1;
781
    int parentfd;
782

783 784 785
    /* Refuse to do a secondary bus reset if there are other
     * devices/functions behind the bus are used by the host
     * or other guests.
786
     */
787
    if ((conflict = virPCIDeviceBusContainsActiveDevices(dev, inactiveDevs))) {
788
        virReportError(VIR_ERR_INTERNAL_ERROR,
789 790
                       _("Active %s devices on bus with %s, not doing bus reset"),
                       conflict->name, dev->name);
791 792 793 794
        return -1;
    }

    /* Find the parent bus */
795
    if (virPCIDeviceGetParent(dev, &parent) < 0)
796
        return -1;
797
    if (!parent) {
798
        virReportError(VIR_ERR_INTERNAL_ERROR,
799 800
                       _("Failed to find parent device for %s"),
                       dev->name);
801 802
        return -1;
    }
803
    if ((parentfd = virPCIDeviceConfigOpen(parent, true)) < 0)
804
        goto out;
805 806 807 808 809 810 811

    VIR_DEBUG("%s %s: doing a secondary bus reset", dev->id, dev->name);

    /* Save and restore the device's config space; we only do this
     * for the supplied device since we refuse to do a reset if there
     * are multiple devices/functions
     */
812
    if (virPCIDeviceRead(dev, cfgfd, 0, config_space, PCI_CONF_LEN) < 0) {
813
        virReportError(VIR_ERR_INTERNAL_ERROR,
814
                       _("Failed to read PCI config space for %s"),
815
                       dev->name);
816 817 818 819 820 821
        goto out;
    }

    /* Read the control register, set the reset flag, wait 200ms,
     * unset the reset flag and wait 200ms.
     */
822
    ctl = virPCIDeviceRead16(dev, cfgfd, PCI_BRIDGE_CONTROL);
823

824 825
    virPCIDeviceWrite16(parent, parentfd, PCI_BRIDGE_CONTROL,
                        ctl | PCI_BRIDGE_CTL_RESET);
826 827 828

    usleep(200 * 1000); /* sleep 200ms */

829
    virPCIDeviceWrite16(parent, parentfd, PCI_BRIDGE_CONTROL, ctl);
830 831 832

    usleep(200 * 1000); /* sleep 200ms */

833
    if (virPCIDeviceWrite(dev, cfgfd, 0, config_space, PCI_CONF_LEN) < 0) {
834
        virReportError(VIR_ERR_INTERNAL_ERROR,
835 836 837 838
                       _("Failed to restore PCI config space for %s"),
                       dev->name);
        goto out;
    }
839
    ret = 0;
840

841
 out:
842 843
    virPCIDeviceConfigClose(parent, parentfd);
    virPCIDeviceFree(parent);
844 845 846 847 848 849 850 851
    return ret;
}

/* Power management reset attempts to reset a device using a
 * D-state transition from D3hot to D0. Note, in detect_pm_reset()
 * above we require the device supports a full internal reset.
 */
static int
852
virPCIDeviceTryPowerManagementReset(virPCIDevicePtr dev, int cfgfd)
853 854 855 856 857 858 859 860
{
    uint8_t config_space[PCI_CONF_LEN];
    uint32_t ctl;

    if (!dev->pci_pm_cap_pos)
        return -1;

    /* Save and restore the device's config space. */
861
    if (virPCIDeviceRead(dev, cfgfd, 0, &config_space[0], PCI_CONF_LEN) < 0) {
862
        virReportError(VIR_ERR_INTERNAL_ERROR,
863
                       _("Failed to read PCI config space for %s"),
864
                       dev->name);
865 866 867 868 869
        return -1;
    }

    VIR_DEBUG("%s %s: doing a power management reset", dev->id, dev->name);

870
    ctl = virPCIDeviceRead32(dev, cfgfd, dev->pci_pm_cap_pos + PCI_PM_CTRL);
871 872
    ctl &= ~PCI_PM_CTRL_STATE_MASK;

873 874
    virPCIDeviceWrite32(dev, cfgfd, dev->pci_pm_cap_pos + PCI_PM_CTRL,
                        ctl | PCI_PM_CTRL_STATE_D3hot);
875 876 877

    usleep(10 * 1000); /* sleep 10ms */

878 879
    virPCIDeviceWrite32(dev, cfgfd, dev->pci_pm_cap_pos + PCI_PM_CTRL,
                        ctl | PCI_PM_CTRL_STATE_D0);
880 881 882

    usleep(10 * 1000); /* sleep 10ms */

883
    if (virPCIDeviceWrite(dev, cfgfd, 0, &config_space[0], PCI_CONF_LEN) < 0) {
884
        virReportError(VIR_ERR_INTERNAL_ERROR,
885 886 887 888
                       _("Failed to restore PCI config space for %s"),
                       dev->name);
        return -1;
    }
889 890 891 892 893

    return 0;
}

static int
894
virPCIDeviceInit(virPCIDevicePtr dev, int cfgfd)
895
{
896 897
    int flr;

898 899 900
    dev->pcie_cap_pos   = virPCIDeviceFindCapabilityOffset(dev, cfgfd, PCI_CAP_ID_EXP);
    dev->pci_pm_cap_pos = virPCIDeviceFindCapabilityOffset(dev, cfgfd, PCI_CAP_ID_PM);
    flr = virPCIDeviceDetectFunctionLevelReset(dev, cfgfd);
901
    if (flr < 0)
902
        return flr;
903 904
    dev->has_flr        = !!flr;
    dev->has_pm_reset   = !!virPCIDeviceDetectPowerManagementReset(dev, cfgfd);
905

906 907 908 909
    return 0;
}

int
910 911 912
virPCIDeviceReset(virPCIDevicePtr dev,
                  virPCIDeviceList *activeDevs,
                  virPCIDeviceList *inactiveDevs)
913
{
914 915
    char *drvPath = NULL;
    char *drvName = NULL;
916
    int ret = -1;
917
    int fd = -1;
918

919
    if (activeDevs && virPCIDeviceListFind(activeDevs, dev)) {
920
        virReportError(VIR_ERR_INTERNAL_ERROR,
921 922 923 924
                       _("Not resetting active device %s"), dev->name);
        return -1;
    }

925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940
    /* If the device is currently bound to vfio-pci, ignore all
     * requests to reset it, since the vfio-pci driver will always
     * reset it whenever appropriate, so doing it ourselves would just
     * be redundant.
     */
    if (virPCIDeviceGetDriverPathAndName(dev, &drvPath, &drvName) < 0)
        goto cleanup;

    if (STREQ_NULLABLE(drvName, "vfio-pci")) {
        VIR_DEBUG("Device %s is bound to vfio-pci - skip reset",
                  dev->name);
        ret = 0;
        goto cleanup;
    }
    VIR_DEBUG("Resetting device %s", dev->name);

941
    if ((fd = virPCIDeviceConfigOpen(dev, true)) < 0)
942
        goto cleanup;
943

944
    if (virPCIDeviceInit(dev, fd) < 0)
945 946
        goto cleanup;

947 948 949
    /* KVM will perform FLR when starting and stopping
     * a guest, so there is no need for us to do it here.
     */
950 951 952 953
    if (dev->has_flr) {
        ret = 0;
        goto cleanup;
    }
954

955 956 957 958 959
    /* If the device supports PCI power management reset,
     * that's the next best thing because it only resets
     * the function, not the whole device.
     */
    if (dev->has_pm_reset)
960
        ret = virPCIDeviceTryPowerManagementReset(dev, fd);
961

962
    /* Bus reset is not an option with the root bus */
963
    if (ret < 0 && dev->bus != 0)
964
        ret = virPCIDeviceTrySecondaryBusReset(dev, fd, inactiveDevs);
965

966 967
    if (ret < 0) {
        virErrorPtr err = virGetLastError();
968
        virReportError(VIR_ERR_INTERNAL_ERROR,
969 970
                       _("Unable to reset PCI device %s: %s"),
                       dev->name,
971 972
                       err ? err->message :
                       _("no FLR, PM reset or bus reset available"));
973 974
    }

975
 cleanup:
976 977
    VIR_FREE(drvPath);
    VIR_FREE(drvName);
978
    virPCIDeviceConfigClose(dev, fd);
979 980 981
    return ret;
}

982

983
static int
984
virPCIProbeStubDriver(const char *driver)
985
{
986
    char *drvpath = NULL;
987
    bool probed = false;
988

989
 recheck:
990
    if (virPCIDriverDir(&drvpath, driver) == 0 && virFileExists(drvpath)) {
991
        /* driver already loaded, return */
992
        VIR_FREE(drvpath);
993
        return 0;
994 995 996
    }

    VIR_FREE(drvpath);
997 998

    if (!probed) {
999
        char *errbuf = NULL;
1000
        probed = true;
1001 1002 1003 1004
        if ((errbuf = virKModLoad(driver, true))) {
            VIR_WARN("failed to load driver %s: %s", driver, errbuf);
            VIR_FREE(errbuf);
            goto cleanup;
1005
        }
1006 1007

        goto recheck;
1008 1009
    }

1010
 cleanup:
1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024
    /* If we know failure was because of blacklist, let's report that;
     * otherwise, report a more generic failure message
     */
    if (virKModIsBlacklisted(driver)) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Failed to load PCI stub module %s: "
                         "administratively prohibited"),
                       driver);
    } else {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Failed to load PCI stub module %s"),
                       driver);
    }

1025
    return -1;
1026 1027
}

1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058
/* Unbind @dev from whatever driver it is currently bound to by writing
 * its address to the device's "driver/unbind" file.  When the write
 * happens, @reprobe is recorded in dev->reprobe.
 *
 * A device that has no driver is not an error.
 *
 * Returns 0 on success, -1 on failure.
 */
int
virPCIDeviceUnbind(virPCIDevicePtr dev, bool reprobe)
{
    char *unbindFile = NULL;
    char *curDrvDir = NULL;
    char *curDrv = NULL;
    int ret = -1;

    if (virPCIDeviceGetDriverPathAndName(dev, &curDrvDir, &curDrv) < 0)
        goto cleanup;

    /* The device is not bound to any driver */
    if (curDrv == NULL) {
        ret = 0;
        goto cleanup;
    }

    if (virPCIFile(&unbindFile, dev->name, "driver/unbind") < 0)
        goto cleanup;

    if (virFileExists(unbindFile)) {
        if (virFileWriteStr(unbindFile, dev->name, 0) < 0) {
            virReportSystemError(errno,
                                 _("Failed to unbind PCI device '%s' from %s"),
                                 dev->name, curDrv);
            goto cleanup;
        }
        dev->reprobe = reprobe;
    }

    ret = 0;

 cleanup:
    VIR_FREE(unbindFile);
    VIR_FREE(curDrvDir);
    VIR_FREE(curDrv);
    return ret;
}

1066 1067 1068 1069 1070 1071 1072
/* NULL-terminated list of the stub driver names this file recognizes;
 * virPCIDeviceUnbindFromStub() only unbinds a device whose current
 * driver is one of these.
 */
static const char *virPCIKnownStubs[] = {
    "pciback",  /* used by xen */
    "pci-stub", /* used by kvm legacy passthrough */
    "vfio-pci", /* used by VFIO device assignment */
    NULL
};

1073
static int
1074
virPCIDeviceUnbindFromStub(virPCIDevicePtr dev)
1075 1076 1077 1078
{
    int result = -1;
    char *drvdir = NULL;
    char *path = NULL;
1079 1080 1081
    char *driver = NULL;
    const char **stubTest;
    bool isStub = false;
1082

1083 1084 1085
    /* If the device is currently bound to one of the "well known"
     * stub drivers, then unbind it, otherwise ignore it.
     */
L
Laine Stump 已提交
1086
    if (virPCIDeviceGetDriverPathAndName(dev, &drvdir, &driver) < 0)
1087
        goto cleanup;
E
Eric Blake 已提交
1088

1089 1090 1091 1092 1093
    if (!driver) {
        /* The device is not bound to any driver and we are almost done. */
        goto reprobe;
    }

1094 1095 1096
    if (!dev->unbind_from_stub)
        goto remove_slot;

1097 1098 1099 1100 1101 1102 1103 1104 1105 1106
    /* If the device isn't bound to a known stub, skip the unbind. */
    for (stubTest = virPCIKnownStubs; *stubTest != NULL; stubTest++) {
        if (STREQ(driver, *stubTest)) {
            isStub = true;
            VIR_DEBUG("Found stub driver %s", *stubTest);
            break;
        }
    }
    if (!isStub)
        goto remove_slot;
1107

1108 1109
    if (virPCIDeviceUnbind(dev, dev->reprobe) < 0)
        goto cleanup;
1110
    dev->unbind_from_stub = false;
1111

1112
 remove_slot:
1113 1114
    if (!dev->remove_slot)
        goto reprobe;
1115 1116

    /* Xen's pciback.ko wants you to use remove_slot on the specific device */
1117
    if (virPCIDriverFile(&path, driver, "remove_slot") < 0) {
1118 1119 1120 1121 1122
        goto cleanup;
    }

    if (virFileExists(path) && virFileWriteStr(path, dev->name, 0) < 0) {
        virReportSystemError(errno,
1123
                             _("Failed to remove slot for PCI device '%s' from %s"),
1124 1125 1126
                             dev->name, driver);
        goto cleanup;
    }
1127
    dev->remove_slot = false;
1128

1129
 reprobe:
1130 1131 1132 1133
    if (!dev->reprobe) {
        result = 0;
        goto cleanup;
    }
1134 1135 1136 1137 1138 1139

    /* Trigger a re-probe of the device is not in the stub's dynamic
     * ID table. If the stub is available, but 'remove_id' isn't
     * available, then re-probing would just cause the device to be
     * re-bound to the stub.
     */
1140
    if (driver && virPCIDriverFile(&path, driver, "remove_id") < 0)
1141 1142
        goto cleanup;

1143
    if (!driver || !virFileExists(drvdir) || virFileExists(path)) {
1144 1145 1146 1147 1148 1149 1150 1151 1152 1153
        if (virFileWriteStr(PCI_SYSFS "drivers_probe", dev->name, 0) < 0) {
            virReportSystemError(errno,
                                 _("Failed to trigger a re-probe for PCI device '%s'"),
                                 dev->name);
            goto cleanup;
        }
    }

    result = 0;

1154
 cleanup:
1155
    /* do not do it again */
1156 1157 1158
    dev->unbind_from_stub = false;
    dev->remove_slot = false;
    dev->reprobe = false;
1159

1160 1161
    VIR_FREE(drvdir);
    VIR_FREE(path);
1162
    VIR_FREE(driver);
1163 1164 1165 1166

    return result;
}

1167 1168

static int
1169 1170
virPCIDeviceBindToStub(virPCIDevicePtr dev,
                       const char *stubDriverName)
1171
{
1172
    int result = -1;
1173
    int reprobe = false;
1174 1175 1176
    char *stubDriverPath = NULL;
    char *driverLink = NULL;
    char *path = NULL; /* reused for different purposes */
J
Jiri Denemark 已提交
1177 1178
    char *newDriverName = NULL;
    virErrorPtr err = NULL;
1179 1180

    if (virPCIDriverDir(&stubDriverPath, stubDriverName) < 0 ||
J
Jiri Denemark 已提交
1181 1182
        virPCIFile(&driverLink, dev->name, "driver") < 0 ||
        VIR_STRDUP(newDriverName, stubDriverName) < 0)
1183 1184
        goto cleanup;

1185 1186 1187 1188 1189
    if (virFileExists(driverLink)) {
        if (virFileLinkPointsTo(driverLink, stubDriverPath)) {
            /* The device is already bound to the correct driver */
            VIR_DEBUG("Device %s is already bound to %s",
                      dev->name, stubDriverName);
1190 1191 1192
            result = 0;
            goto cleanup;
        }
1193
        reprobe = true;
1194
    }
1195 1196 1197 1198 1199 1200 1201 1202 1203

    /* Add the PCI device ID to the stub's dynamic ID table;
     * this is needed to allow us to bind the device to the stub.
     * Note: if the device is not currently bound to any driver,
     * stub will immediately be bound to the device. Also, note
     * that if a new device with this ID is hotplugged, or if a probe
     * is triggered for such a device, it will also be immediately
     * bound by the stub.
     */
1204
    if (virPCIDriverFile(&path, stubDriverName, "new_id") < 0) {
1205
        goto cleanup;
1206 1207
    }

1208
    if (virFileWriteStr(path, dev->id, 0) < 0) {
1209
        virReportSystemError(errno,
1210
                             _("Failed to add PCI device ID '%s' to %s"),
1211
                             dev->id, stubDriverName);
1212
        goto cleanup;
1213 1214
    }

1215
    /* check whether the device is bound to pci-stub when we write dev->id to
1216
     * ${stubDriver}/new_id.
1217
     */
1218
    if (virFileLinkPointsTo(driverLink, stubDriverPath)) {
1219 1220
        dev->unbind_from_stub = true;
        dev->remove_slot = true;
J
Jiri Denemark 已提交
1221
        result = 0;
1222 1223 1224
        goto remove_id;
    }

1225
    if (virPCIDeviceUnbind(dev, reprobe) < 0)
J
Jiri Denemark 已提交
1226
        goto remove_id;
1227

1228 1229
    /* If the device isn't already bound to pci-stub, try binding it now.
     */
1230
    if (!virFileLinkPointsTo(driverLink, stubDriverPath)) {
1231
        /* Xen's pciback.ko wants you to use new_slot first */
1232
        if (virPCIDriverFile(&path, stubDriverName, "new_slot") < 0) {
1233
            goto remove_id;
1234 1235
        }

1236
        if (virFileExists(path) && virFileWriteStr(path, dev->name, 0) < 0) {
1237
            virReportSystemError(errno,
1238 1239 1240
                                 _("Failed to add slot for "
                                   "PCI device '%s' to %s"),
                                 dev->name, stubDriverName);
1241
            goto remove_id;
1242
        }
1243
        dev->remove_slot = true;
1244

1245
        if (virPCIDriverFile(&path, stubDriverName, "bind") < 0) {
1246
            goto remove_id;
1247 1248
        }

1249
        if (virFileWriteStr(path, dev->name, 0) < 0) {
1250
            virReportSystemError(errno,
1251
                                 _("Failed to bind PCI device '%s' to %s"),
1252
                                 dev->name, stubDriverName);
1253
            goto remove_id;
1254
        }
1255
        dev->unbind_from_stub = true;
1256 1257
    }

J
Jiri Denemark 已提交
1258 1259
    result = 0;

1260
 remove_id:
J
Jiri Denemark 已提交
1261 1262
    err = virSaveLastError();

1263 1264 1265
    /* If 'remove_id' exists, remove the device id from pci-stub's dynamic
     * ID table so that 'drivers_probe' works below.
     */
1266
    if (virPCIDriverFile(&path, stubDriverName, "remove_id") < 0) {
E
Eric Blake 已提交
1267
        /* We do not remove PCI ID from pci-stub, and we cannot reprobe it */
1268 1269
        if (dev->reprobe) {
            VIR_WARN("Could not remove PCI ID '%s' from %s, and the device "
1270
                     "cannot be probed again.", dev->id, stubDriverName);
1271
        }
1272
        dev->reprobe = false;
J
Jiri Denemark 已提交
1273
        result = -1;
1274 1275 1276
        goto cleanup;
    }

1277
    if (virFileExists(path) && virFileWriteStr(path, dev->id, 0) < 0) {
1278
        virReportSystemError(errno,
1279
                             _("Failed to remove PCI ID '%s' from %s"),
1280
                             dev->id, stubDriverName);
1281

E
Eric Blake 已提交
1282
        /* remove PCI ID from pci-stub failed, and we cannot reprobe it */
1283 1284
        if (dev->reprobe) {
            VIR_WARN("Failed to remove PCI ID '%s' from %s, and the device "
1285
                     "cannot be probed again.", dev->id, stubDriverName);
1286
        }
1287
        dev->reprobe = false;
J
Jiri Denemark 已提交
1288
        result = -1;
1289
        goto cleanup;
1290 1291
    }

1292
 cleanup:
1293 1294
    VIR_FREE(stubDriverPath);
    VIR_FREE(driverLink);
1295 1296
    VIR_FREE(path);

J
Jiri Denemark 已提交
1297 1298 1299 1300
    if (result < 0) {
        VIR_FREE(newDriverName);
        virPCIDeviceUnbindFromStub(dev);
    } else {
1301
        VIR_FREE(dev->stubDriver);
J
Jiri Denemark 已提交
1302
        dev->stubDriver = newDriverName;
1303
    }
J
Jiri Denemark 已提交
1304 1305 1306 1307

    if (err)
        virSetError(err);
    virFreeError(err);
1308

1309
    return result;
1310 1311
}

1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329
/* virPCIDeviceDetach:
 *
 * Detach this device from the host driver, attach it to the stub
 * driver (previously set with virPCIDeviceSetStubDriver(), and add *a
 * copy* of the object to the inactiveDevs list (if provided). This
 * function will *never* consume dev, so the caller should free it.
 *
 * Returns 0 on success, -1 on failure (will fail if the device is
 * already in the activeDevs list, but will be a NOP if the device is
 * already bound to the stub).
 *
 * GENERAL NOTE: activeDevs should be a list of all PCI devices
 * currently in use by a domain. inactiveDevs is a list of all PCI
 * devices that libvirt has detached from the host driver + attached
 * to the stub driver, but hasn't yet assigned to a domain. Any device
 * that is still attached to its host driver should not be on either
 * list.
 */
1330
int
1331 1332
virPCIDeviceDetach(virPCIDevicePtr dev,
                   virPCIDeviceList *activeDevs,
1333
                   virPCIDeviceList *inactiveDevs)
1334
{
J
John Ferlan 已提交
1335 1336
    sa_assert(dev->stubDriver);

1337
    if (virPCIProbeStubDriver(dev->stubDriver) < 0)
1338 1339
        return -1;

1340
    if (activeDevs && virPCIDeviceListFind(activeDevs, dev)) {
1341
        virReportError(VIR_ERR_INTERNAL_ERROR,
1342 1343 1344 1345
                       _("Not detaching active device %s"), dev->name);
        return -1;
    }

1346
    if (virPCIDeviceBindToStub(dev, dev->stubDriver) < 0)
1347 1348
        return -1;

1349 1350 1351
    /* Add *a copy of* the dev into list inactiveDevs, if
     * it's not already there.
     */
1352 1353 1354
    if (inactiveDevs && !virPCIDeviceListFind(inactiveDevs, dev) &&
        virPCIDeviceListAddCopy(inactiveDevs, dev) < 0) {
        return -1;
1355 1356 1357
    }

    return 0;
1358 1359 1360
}

int
1361 1362
virPCIDeviceReattach(virPCIDevicePtr dev,
                     virPCIDeviceListPtr activeDevs,
1363
                     virPCIDeviceListPtr inactiveDevs)
1364
{
1365
    if (activeDevs && virPCIDeviceListFind(activeDevs, dev)) {
1366
        virReportError(VIR_ERR_INTERNAL_ERROR,
1367 1368 1369 1370
                       _("Not reattaching active device %s"), dev->name);
        return -1;
    }

1371
    if (virPCIDeviceUnbindFromStub(dev) < 0)
1372 1373 1374 1375
        return -1;

    /* Steal the dev from list inactiveDevs */
    if (inactiveDevs)
1376
        virPCIDeviceListDel(inactiveDevs, dev);
1377 1378

    return 0;
1379 1380
}

1381 1382 1383 1384 1385
/* Certain hypervisors (like qemu/kvm) map the PCI bar(s) on
 * the host when doing device passthrough.  This can lead to a race
 * condition where the hypervisor is still cleaning up the device while
 * libvirt is trying to re-attach it to the host device driver.  To avoid
 * this situation, we look through /proc/iomem, and if the hypervisor is
E
Eric Blake 已提交
1386 1387
 * still holding on to the bar (denoted by the string in the matcher
 * variable), then we can wait around a bit for that to clear up.
1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407
 *
 * A typical /proc/iomem looks like this (snipped for brevity):
 * 00010000-0008efff : System RAM
 * 0008f000-0008ffff : reserved
 * ...
 * 00100000-cc9fcfff : System RAM
 *   00200000-00483d3b : Kernel code
 *   00483d3c-005c88df : Kernel data
 * cc9fd000-ccc71fff : ACPI Non-volatile Storage
 * ...
 * d0200000-d02fffff : PCI Bus #05
 *   d0200000-d021ffff : 0000:05:00.0
 *     d0200000-d021ffff : e1000e
 *   d0220000-d023ffff : 0000:05:00.0
 *     d0220000-d023ffff : e1000e
 * ...
 * f0000000-f0003fff : 0000:00:1b.0
 *   f0000000-f0003fff : kvm_assigned_device
 *
 * Returns 0 if we are clear to continue, and 1 if the hypervisor is still
E
Eric Blake 已提交
1408
 * holding on to the resource.
1409 1410
 */
int
1411
virPCIDeviceWaitForCleanup(virPCIDevicePtr dev, const char *matcher)
1412 1413 1414
{
    FILE *fp;
    char line[160];
1415
    char *tmp;
1416
    unsigned long long start, end;
1417
    unsigned int domain, bus, slot, function;
1418
    bool in_matching_device;
1419 1420 1421 1422 1423 1424 1425 1426 1427
    int ret;
    size_t match_depth;

    fp = fopen("/proc/iomem", "r");
    if (!fp) {
        /* If we failed to open iomem, we just basically ignore the error.  The
         * unbind might succeed anyway, and besides, it's very likely we have
         * no way to report the error
         */
1428
        VIR_DEBUG("Failed to open /proc/iomem, trying to continue anyway");
1429 1430 1431 1432
        return 0;
    }

    ret = 0;
1433
    in_matching_device = false;
1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444
    match_depth = 0;
    while (fgets(line, sizeof(line), fp) != 0) {
        /* the logic here is a bit confusing.  For each line, we look to
         * see if it matches the domain:bus:slot.function we were given.
         * If this line matches the DBSF, then any subsequent lines indented
         * by 2 spaces are the PCI regions for this device.  It's also
         * possible that none of the PCI regions are currently mapped, in
         * which case we have no indented regions.  This code handles all
         * of these situations
         */
        if (in_matching_device && (strspn(line, " ") == (match_depth + 2))) {
1445 1446 1447 1448 1449 1450
            /* expected format: <start>-<end> : <suffix> */
            if (/* start */
                virStrToLong_ull(line, &tmp, 16, &start) < 0 || *tmp != '-' ||
                /* end */
                virStrToLong_ull(tmp + 1, &tmp, 16, &end) < 0 ||
                (tmp = STRSKIP(tmp, " : ")) == NULL)
1451 1452
                continue;

1453
            if (STRPREFIX(tmp, matcher)) {
1454 1455 1456 1457 1458
                ret = 1;
                break;
            }
        }
        else {
1459
            in_matching_device = false;
1460

1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474
            /* expected format: <start>-<end> : <domain>:<bus>:<slot>.<function> */
            if (/* start */
                virStrToLong_ull(line, &tmp, 16, &start) < 0 || *tmp != '-' ||
                /* end */
                virStrToLong_ull(tmp + 1, &tmp, 16, &end) < 0 ||
                (tmp = STRSKIP(tmp, " : ")) == NULL ||
                /* domain */
                virStrToLong_ui(tmp, &tmp, 16, &domain) < 0 || *tmp != ':' ||
                /* bus */
                virStrToLong_ui(tmp + 1, &tmp, 16, &bus) < 0 || *tmp != ':' ||
                /* slot */
                virStrToLong_ui(tmp + 1, &tmp, 16, &slot) < 0 || *tmp != '.' ||
                /* function */
                virStrToLong_ui(tmp + 1, &tmp, 16, &function) < 0 || *tmp != '\n')
1475 1476 1477 1478 1479
                continue;

            if (domain != dev->domain || bus != dev->bus || slot != dev->slot ||
                function != dev->function)
                continue;
1480
            in_matching_device = true;
1481 1482 1483 1484
            match_depth = strspn(line, " ");
        }
    }

E
Eric Blake 已提交
1485
    VIR_FORCE_FCLOSE(fp);
1486 1487 1488 1489

    return ret;
}

1490
static char *
1491
virPCIDeviceReadID(virPCIDevicePtr dev, const char *id_name)
1492
{
1493
    char *path = NULL;
1494 1495
    char *id_str;

1496
    if (virPCIFile(&path, dev->name, id_name) < 0) {
1497 1498
        return NULL;
    }
1499 1500

    /* ID string is '0xNNNN\n' ... i.e. 7 bytes */
1501 1502
    if (virFileReadAll(path, 7, &id_str) < 0) {
        VIR_FREE(path);
1503
        return NULL;
1504 1505 1506
    }

    VIR_FREE(path);
1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519

    /* Check for 0x suffix */
    if (id_str[0] != '0' || id_str[1] != 'x') {
        VIR_FREE(id_str);
        return NULL;
    }

    /* Chop off the newline; we know the string is 7 bytes */
    id_str[6] = '\0';

    return id_str;
}

1520
int
1521 1522 1523 1524
virPCIGetAddrString(unsigned int domain,
                    unsigned int bus,
                    unsigned int slot,
                    unsigned int function,
1525
                    char **pciConfigAddr)
1526
{
1527
    virPCIDevicePtr dev = NULL;
1528 1529
    int ret = -1;

1530
    dev = virPCIDeviceNew(domain, bus, slot, function);
1531
    if (dev != NULL) {
1532
        if (VIR_STRDUP(*pciConfigAddr, dev->name) < 0)
1533 1534 1535 1536
            goto cleanup;
        ret = 0;
    }

1537
 cleanup:
1538
    virPCIDeviceFree(dev);
1539 1540 1541
    return ret;
}

1542
virPCIDevicePtr
1543 1544 1545 1546
virPCIDeviceNew(unsigned int domain,
                unsigned int bus,
                unsigned int slot,
                unsigned int function)
1547
{
1548
    virPCIDevicePtr dev;
E
Eric Blake 已提交
1549 1550
    char *vendor = NULL;
    char *product = NULL;
1551

1552
    if (VIR_ALLOC(dev) < 0)
1553 1554 1555 1556 1557 1558 1559
        return NULL;

    dev->domain   = domain;
    dev->bus      = bus;
    dev->slot     = slot;
    dev->function = function;

E
Eric Blake 已提交
1560 1561 1562
    if (snprintf(dev->name, sizeof(dev->name), "%.4x:%.2x:%.2x.%.1x",
                 dev->domain, dev->bus, dev->slot,
                 dev->function) >= sizeof(dev->name)) {
1563
        virReportError(VIR_ERR_INTERNAL_ERROR,
E
Eric Blake 已提交
1564 1565
                       _("dev->name buffer overflow: %.4x:%.2x:%.2x.%.1x"),
                       dev->domain, dev->bus, dev->slot, dev->function);
E
Eric Blake 已提交
1566
        goto error;
E
Eric Blake 已提交
1567 1568
    }
    if (virAsprintf(&dev->path, PCI_SYSFS "devices/%s/config",
1569
                    dev->name) < 0)
E
Eric Blake 已提交
1570
        goto error;
1571

1572
    if (!virFileExists(dev->path)) {
1573 1574 1575
        virReportSystemError(errno,
                             _("Device %s not found: could not access %s"),
                             dev->name, dev->path);
E
Eric Blake 已提交
1576
        goto error;
1577 1578
    }

1579 1580
    vendor  = virPCIDeviceReadID(dev, "vendor");
    product = virPCIDeviceReadID(dev, "device");
1581 1582

    if (!vendor || !product) {
1583
        virReportError(VIR_ERR_INTERNAL_ERROR,
1584 1585
                       _("Failed to read product/vendor ID for %s"),
                       dev->name);
E
Eric Blake 已提交
1586
        goto error;
1587 1588 1589
    }

    /* strings contain '0x' prefix */
E
Eric Blake 已提交
1590 1591
    if (snprintf(dev->id, sizeof(dev->id), "%s %s", &vendor[2],
                 &product[2]) >= sizeof(dev->id)) {
1592
        virReportError(VIR_ERR_INTERNAL_ERROR,
E
Eric Blake 已提交
1593 1594
                       _("dev->id buffer overflow: %s %s"),
                       &vendor[2], &product[2]);
E
Eric Blake 已提交
1595
        goto error;
E
Eric Blake 已提交
1596
    }
1597 1598 1599

    VIR_DEBUG("%s %s: initialized", dev->id, dev->name);

1600
 cleanup:
E
Eric Blake 已提交
1601 1602
    VIR_FREE(product);
    VIR_FREE(vendor);
1603
    return dev;
E
Eric Blake 已提交
1604

1605
 error:
1606
    virPCIDeviceFree(dev);
E
Eric Blake 已提交
1607 1608
    dev = NULL;
    goto cleanup;
1609 1610
}

L
Laine Stump 已提交
1611 1612 1613 1614 1615 1616

/* Create an independent copy of @dev: all plain members are copied as
 * is, while the heap-allocated strings (path, stubDriver,
 * used_by_drvname, used_by_domname) are duplicated.
 *
 * Returns the copy, or NULL on failure.
 */
virPCIDevicePtr
virPCIDeviceCopy(virPCIDevicePtr dev)
{
    virPCIDevicePtr dup;

    if (VIR_ALLOC(dup) < 0)
        return NULL;

    /* shallow copy to take care of most attributes */
    *dup = *dev;

    /* The string pointers must not be shared with the original, so
     * reset them before duplicating each one. */
    dup->path = NULL;
    dup->stubDriver = NULL;
    dup->used_by_drvname = NULL;
    dup->used_by_domname = NULL;

    if (VIR_STRDUP(dup->path, dev->path) < 0 ||
        VIR_STRDUP(dup->stubDriver, dev->stubDriver) < 0 ||
        VIR_STRDUP(dup->used_by_drvname, dev->used_by_drvname) < 0 ||
        VIR_STRDUP(dup->used_by_domname, dev->used_by_domname) < 0) {
        virPCIDeviceFree(dup);
        return NULL;
    }

    return dup;
}


1638
void
1639
virPCIDeviceFree(virPCIDevicePtr dev)
1640
{
1641 1642
    if (!dev)
        return;
1643
    VIR_DEBUG("%s %s: freeing", dev->id, dev->name);
E
Eric Blake 已提交
1644
    VIR_FREE(dev->path);
1645
    VIR_FREE(dev->stubDriver);
C
Chunyan Liu 已提交
1646 1647
    VIR_FREE(dev->used_by_drvname);
    VIR_FREE(dev->used_by_domname);
1648 1649
    VIR_FREE(dev);
}
1650

1651
const char *
1652
virPCIDeviceGetName(virPCIDevicePtr dev)
1653 1654 1655 1656
{
    return dev->name;
}

1657
/* Record the "managed" flag of @dev. */
void
virPCIDeviceSetManaged(virPCIDevicePtr dev, bool managed)
{
    dev->managed = managed;
}

1662 1663
/* Return the "managed" flag of @dev. */
unsigned int
virPCIDeviceGetManaged(virPCIDevicePtr dev)
{
    return dev->managed;
}

1668
int
1669 1670
virPCIDeviceSetStubDriver(virPCIDevicePtr dev, const char *driver)
{
1671
    VIR_FREE(dev->stubDriver);
J
John Ferlan 已提交
1672
    return VIR_STRDUP(dev->stubDriver, driver);
1673 1674 1675 1676 1677 1678 1679 1680
}

/* Return the stub driver name stored in @dev (may be NULL).  The
 * string belongs to @dev.
 */
const char *
virPCIDeviceGetStubDriver(virPCIDevicePtr dev)
{
    return dev->stubDriver;
}

1681
unsigned int
1682
virPCIDeviceGetUnbindFromStub(virPCIDevicePtr dev)
1683 1684 1685 1686 1687
{
    return dev->unbind_from_stub;
}

void
1688
virPCIDeviceSetUnbindFromStub(virPCIDevicePtr dev, bool unbind)
1689
{
1690
    dev->unbind_from_stub = unbind;
1691 1692
}

1693
unsigned int
1694
virPCIDeviceGetRemoveSlot(virPCIDevicePtr dev)
1695 1696 1697 1698 1699
{
    return dev->remove_slot;
}

void
1700
virPCIDeviceSetRemoveSlot(virPCIDevicePtr dev, bool remove_slot)
1701
{
1702
    dev->remove_slot = remove_slot;
1703 1704
}

1705
unsigned int
1706
virPCIDeviceGetReprobe(virPCIDevicePtr dev)
1707 1708 1709 1710 1711
{
    return dev->reprobe;
}

void
1712
virPCIDeviceSetReprobe(virPCIDevicePtr dev, bool reprobe)
1713
{
1714
    dev->reprobe = reprobe;
1715 1716
}

C
Chunyan Liu 已提交
1717 1718 1719 1720
/* Record which driver (@drv_name) and domain (@dom_name) use @dev.
 * Both strings are copied; the previously stored ones are freed first.
 *
 * Returns 0 on success, -1 if either copy fails.
 */
int
virPCIDeviceSetUsedBy(virPCIDevicePtr dev,
                      const char *drv_name,
                      const char *dom_name)
{
    VIR_FREE(dev->used_by_drvname);
    VIR_FREE(dev->used_by_domname);

    if (VIR_STRDUP(dev->used_by_drvname, drv_name) < 0 ||
        VIR_STRDUP(dev->used_by_domname, dom_name) < 0)
        return -1;

    return 0;
}

C
Chunyan Liu 已提交
1732 1733 1734 1735
/* Fetch the driver and domain names recorded by
 * virPCIDeviceSetUsedBy().  The returned strings belong to @dev.
 */
void
virPCIDeviceGetUsedBy(virPCIDevicePtr dev,
                      const char **drv_name,
                      const char **dom_name)
{
    *dom_name = dev->used_by_domname;
    *drv_name = dev->used_by_drvname;
}

1741
/* Turn on all three reattach flags (unbind_from_stub, remove_slot,
 * reprobe) of @pci; these are the flags virPCIDeviceUnbindFromStub()
 * acts upon.
 */
void
virPCIDeviceReattachInit(virPCIDevicePtr pci)
{
    pci->reprobe = true;
    pci->remove_slot = true;
    pci->unbind_from_stub = true;
}


1749 1750
virPCIDeviceListPtr
virPCIDeviceListNew(void)
1751
{
1752
    virPCIDeviceListPtr list;
1753

1754 1755 1756 1757
    if (virPCIInitialize() < 0)
        return NULL;

    if (!(list = virObjectLockableNew(virPCIDeviceListClass)))
1758 1759 1760 1761 1762
        return NULL;

    return list;
}

1763 1764
static void
virPCIDeviceListDispose(void *obj)
1765
{
1766
    virPCIDeviceListPtr list = obj;
1767
    size_t i;
1768 1769

    for (i = 0; i < list->count; i++) {
1770
        virPCIDeviceFree(list->devs[i]);
1771 1772 1773 1774 1775 1776 1777 1778
        list->devs[i] = NULL;
    }

    list->count = 0;
    VIR_FREE(list->devs);
}

int
1779 1780
virPCIDeviceListAdd(virPCIDeviceListPtr list,
                    virPCIDevicePtr dev)
1781
{
1782
    if (virPCIDeviceListFind(list, dev)) {
1783
        virReportError(VIR_ERR_INTERNAL_ERROR,
1784 1785 1786
                       _("Device %s is already in use"), dev->name);
        return -1;
    }
1787
    return VIR_APPEND_ELEMENT(list->devs, list->count, dev);
1788 1789
}

L
Laine Stump 已提交
1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806

/* virPCIDeviceListAddCopy - add a *copy* of the device to this list
 *
 * @dev itself is left untouched and stays owned by the caller.
 *
 * Returns 0 on success, -1 if copying or adding fails.
 */
int
virPCIDeviceListAddCopy(virPCIDeviceListPtr list, virPCIDevicePtr dev)
{
    virPCIDevicePtr dup;

    if (!(dup = virPCIDeviceCopy(dev)))
        return -1;

    if (virPCIDeviceListAdd(list, dup) >= 0)
        return 0;

    /* The list did not take the copy, so it is still ours to free. */
    virPCIDeviceFree(dup);
    return -1;
}


1807 1808 1809
/* Return the device stored at position @idx of @list, or NULL if @idx
 * is out of range.  The device stays in the list.
 */
virPCIDevicePtr
virPCIDeviceListGet(virPCIDeviceListPtr list,
                    int idx)
{
    if (idx < 0 || idx >= list->count)
        return NULL;

    return list->devs[idx];
}

1819
size_t
1820
virPCIDeviceListCount(virPCIDeviceListPtr list)
1821
{
1822 1823 1824
    return list->count;
}

1825 1826 1827
/* Remove the device at position @idx from @list and return it without
 * freeing it.  Returns NULL if @idx is out of range.
 */
virPCIDevicePtr
virPCIDeviceListStealIndex(virPCIDeviceListPtr list,
                           int idx)
{
    virPCIDevicePtr stolen;

    if (idx < 0)
        return NULL;
    if (idx >= list->count)
        return NULL;

    stolen = list->devs[idx];
    VIR_DELETE_ELEMENT(list->devs, idx, list->count);

    return stolen;
}

1839 1840 1841
/* Remove the list entry that has the same address as @dev and return
 * it without freeing it.  Returns NULL if there is no such entry.
 */
virPCIDevicePtr
virPCIDeviceListSteal(virPCIDeviceListPtr list,
                      virPCIDevicePtr dev)
{
    int pos = virPCIDeviceListFindIndex(list, dev);

    /* A "not found" index of -1 is rejected by the callee. */
    return virPCIDeviceListStealIndex(list, pos);
}

1846
void
1847 1848
virPCIDeviceListDel(virPCIDeviceListPtr list,
                    virPCIDevicePtr dev)
1849
{
1850
    virPCIDevicePtr ret = virPCIDeviceListSteal(list, dev);
1851
    virPCIDeviceFree(ret);
1852 1853
}

1854
int
1855
virPCIDeviceListFindIndex(virPCIDeviceListPtr list, virPCIDevicePtr dev)
1856
{
1857
    size_t i;
1858 1859 1860 1861 1862 1863

    for (i = 0; i < list->count; i++)
        if (list->devs[i]->domain   == dev->domain &&
            list->devs[i]->bus      == dev->bus    &&
            list->devs[i]->slot     == dev->slot   &&
            list->devs[i]->function == dev->function)
1864 1865 1866 1867
            return i;
    return -1;
}

L
Laine Stump 已提交
1868 1869 1870 1871 1872 1873 1874 1875

/* Look for a list entry with the given domain/bus/slot/function.
 *
 * Returns the first matching device (still owned by the list), or NULL
 * if there is none.
 */
virPCIDevicePtr
virPCIDeviceListFindByIDs(virPCIDeviceListPtr list,
                          unsigned int domain,
                          unsigned int bus,
                          unsigned int slot,
                          unsigned int function)
{
    size_t pos;

    for (pos = 0; pos < list->count; pos++) {
        virPCIDevicePtr cand = list->devs[pos];

        if (cand->domain == domain && cand->bus == bus &&
            cand->slot == slot && cand->function == function)
            return cand;
    }

    return NULL;
}


1889 1890
/* Return the list entry that has the same address as @dev, or NULL if
 * the list holds no such device.  The entry stays in the list.
 */
virPCIDevicePtr
virPCIDeviceListFind(virPCIDeviceListPtr list, virPCIDevicePtr dev)
{
    int pos = virPCIDeviceListFindIndex(list, dev);

    if (pos < 0)
        return NULL;

    return list->devs[pos];
}
1899 1900


1901 1902 1903
/*
 * virPCIDeviceFileIterate:
 * @dev: device whose sysfs directory is scanned
 * @actor: callback invoked as actor(dev, path, opaque) per matching file
 * @opaque: passed through to @actor untouched
 *
 * Walks /sys/bus/pci/devices/<domain:bus:slot.function> and calls
 * @actor for "config", "rom", "reset" and every entry whose name
 * starts with "resource".
 *
 * Returns 0 on success, -1 if the directory cannot be opened or read,
 * a path cannot be built, or @actor returns a negative value.
 */
int virPCIDeviceFileIterate(virPCIDevicePtr dev,
                            virPCIDeviceFileActor actor,
                            void *opaque)
{
    char *pcidir = NULL;
    char *file = NULL;
    DIR *dir = NULL;
    int ret = -1;
    struct dirent *ent;

    if (virAsprintf(&pcidir, "/sys/bus/pci/devices/%04x:%02x:%02x.%x",
                    dev->domain, dev->bus, dev->slot, dev->function) < 0)
        goto cleanup;

    if (!(dir = opendir(pcidir))) {
        virReportSystemError(errno,
                             _("cannot open %s"), pcidir);
        goto cleanup;
    }

    /* readdir() returns NULL both at end-of-directory and on failure;
     * clearing errno before every call lets the check after the loop
     * tell the two apart.
     */
    while ((errno = 0, ent = readdir(dir)) != NULL) {
        /* Device assignment requires:
         *   $PCIDIR/config, $PCIDIR/resource, $PCIDIR/resourceNNN,
         *   $PCIDIR/rom, $PCIDIR/reset
         */
        if (STREQ(ent->d_name, "config") ||
            STRPREFIX(ent->d_name, "resource") ||
            STREQ(ent->d_name, "rom") ||
            STREQ(ent->d_name, "reset")) {
            if (virAsprintf(&file, "%s/%s", pcidir, ent->d_name) < 0)
                goto cleanup;
            if ((actor)(dev, file, opaque) < 0)
                goto cleanup;

            VIR_FREE(file);
        }
    }
    if (errno != 0) {
        virReportSystemError(errno,
                             _("Failed to read directory entry for %s"),
                             pcidir);
        goto cleanup;
    }

    ret = 0;

 cleanup:
    if (dir)
        closedir(dir);
    VIR_FREE(file);
    VIR_FREE(pcidir);
    return ret;
}
J
Jiri Denemark 已提交
1948

L
Laine Stump 已提交
1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966

/* virPCIDeviceAddressIOMMUGroupIterate:
 *   Invoke @actor once for every device listed in the iommu_group of
 *   @orig (the listing includes @orig itself). When the group's
 *   "devices" directory cannot be opened, @actor is invoked exactly
 *   once, for @orig, and its result is returned.
 *
 *   Returns 0 on success, -1 on failure (an unparsable entry, a
 *   negative return from @actor, or a directory read error).
 */
int
virPCIDeviceAddressIOMMUGroupIterate(virPCIDeviceAddressPtr orig,
                                     virPCIDeviceAddressActor actor,
                                     void *opaque)
{
    char *path = NULL;
    DIR *dp = NULL;
    struct dirent *entry;
    int rc = -1;

    if (virAsprintf(&path,
                    PCI_SYSFS "devices/%04x:%02x:%02x.%x/iommu_group/devices",
                    orig->domain, orig->bus, orig->slot, orig->function) < 0)
        goto cleanup;

    dp = opendir(path);
    if (!dp) {
        /* no group directory: visit only the original device */
        rc = (actor)(orig, opaque);
        goto cleanup;
    }

    for (;;) {
        virPCIDeviceAddress member;

        /* zero errno so a NULL from readdir() can be classified below */
        errno = 0;
        if (!(entry = readdir(dp)))
            break;

        if (entry->d_name[0] == '.')
            continue;

        if (virPCIDeviceAddressParse(entry->d_name, &member) < 0) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Found invalid device link '%s' in '%s'"),
                           entry->d_name, path);
            goto cleanup;
        }

        if ((actor)(&member, opaque) < 0)
            goto cleanup;
    }

    if (errno != 0) {
        virReportSystemError(errno,
                             _("Failed to read directory entry for %s"),
                             path);
        goto cleanup;
    }

    rc = 0;

 cleanup:
    VIR_FREE(path);
    if (dp)
        closedir(dp);
    return rc;
}


/*
 * virPCIDeviceGetIOMMUGroupAddOne:
 * Iteration callback: creates a virPCIDevice for @newDevAddr and adds
 * it to the virPCIDeviceList passed in @opaque.
 *
 * Returns 0 on success, -1 if the device cannot be created or added.
 */
static int
virPCIDeviceGetIOMMUGroupAddOne(virPCIDeviceAddressPtr newDevAddr, void *opaque)
{
    int rc = -1;
    virPCIDeviceListPtr members = opaque;
    virPCIDevicePtr fresh = virPCIDeviceNew(newDevAddr->domain,
                                            newDevAddr->bus,
                                            newDevAddr->slot,
                                            newDevAddr->function);

    if (fresh && virPCIDeviceListAdd(members, fresh) >= 0) {
        /* it's now on the list; keep it away from the free below */
        fresh = NULL;
        rc = 0;
    }

    virPCIDeviceFree(fresh);
    return rc;
}


/*
 * virPCIDeviceGetIOMMUGroupList - build a new virPCIDeviceList holding
 * one entry for each device that shares an iommu_group with @dev.
 *
 * Returns the list, or NULL if it could not be created or filled.
 */
virPCIDeviceListPtr
virPCIDeviceGetIOMMUGroupList(virPCIDevicePtr dev)
{
    virPCIDeviceAddress self = { dev->domain, dev->bus,
                                 dev->slot, dev->function };
    virPCIDeviceListPtr members = virPCIDeviceListNew();

    if (members &&
        virPCIDeviceAddressIOMMUGroupIterate(&self,
                                             virPCIDeviceGetIOMMUGroupAddOne,
                                             members) >= 0)
        return members;

    /* drop our reference to the (possibly partially filled) list */
    virObjectUnref(members);
    return NULL;
}


/* Pair of caller-supplied out-parameters that is handed to
 * virPCIGetIOMMUGroupAddressesAddOne() through its opaque pointer:
 * the address array being grown and the element count of that array.
 */
typedef struct {
    virPCIDeviceAddressPtr **iommuGroupDevices;
    size_t *nIommuGroupDevices;
} virPCIDeviceAddressList;
typedef virPCIDeviceAddressList *virPCIDeviceAddressListPtr;

/*
 * virPCIGetIOMMUGroupAddressesAddOne:
 * Iteration callback: appends a heap-allocated copy of @newDevAddr to
 * the array described by the virPCIDeviceAddressList in @opaque.
 *
 * Returns 0 on success, -1 if allocation or the append fails.
 */
static int
virPCIGetIOMMUGroupAddressesAddOne(virPCIDeviceAddressPtr newDevAddr, void *opaque)
{
    virPCIDeviceAddressListPtr out = opaque;
    virPCIDeviceAddressPtr addrCopy;
    int rc = -1;

    /* the array stores pointers, so each entry needs its own copy */
    if (VIR_ALLOC(addrCopy) >= 0) {
        *addrCopy = *newDevAddr;

        if (VIR_APPEND_ELEMENT(*out->iommuGroupDevices,
                               *out->nIommuGroupDevices, addrCopy) >= 0)
            rc = 0;
    }

    /* NOTE(review): presumably VIR_APPEND_ELEMENT clears addrCopy on
     * success, making this free a no-op then - confirm in viralloc.h */
    VIR_FREE(addrCopy);
    return rc;
}


/*
 * virPCIDeviceAddressGetIOMMUGroupAddresses:
 * @devAddr: address of the device whose iommu_group is enumerated
 * @iommuGroupDevices: array that receives one allocated
 *                     virPCIDeviceAddress per device in the group
 * @nIommuGroupDevices: element count of @iommuGroupDevices, updated
 *                      as entries are appended
 *
 * Appends a copy of the address of every device in the same
 * iommu_group as @devAddr to @iommuGroupDevices. The array and count
 * are not reset here, so callers should pass them initialized.
 *
 * Returns 0 on success, -1 on failure. Nothing is freed here on
 * failure: entries appended before the error stay in the array for
 * the caller to release.
 */
int
virPCIDeviceAddressGetIOMMUGroupAddresses(virPCIDeviceAddressPtr devAddr,
                                          virPCIDeviceAddressPtr **iommuGroupDevices,
                                          size_t *nIommuGroupDevices)
{
    virPCIDeviceAddressList addrList = { iommuGroupDevices,
                                         nIommuGroupDevices };

    if (virPCIDeviceAddressIOMMUGroupIterate(devAddr,
                                             virPCIGetIOMMUGroupAddressesAddOne,
                                             &addrList) < 0)
        return -1;

    return 0;
}


/* virPCIDeviceAddressGetIOMMUGroupNum - look up the number of the
 * iommu_group that the PCI device at @addr belongs to.
 *
 * Returns the group number, -2 when virFileIsLink() does not report
 * the device's iommu_group entry as a symlink (no iommu_group for the
 * device), or -1 on any other failure.
 */
int
virPCIDeviceAddressGetIOMMUGroupNum(virPCIDeviceAddressPtr addr)
{
    char *bdfName = NULL;
    char *linkPath = NULL;
    char *linkTarget = NULL;
    const char *numStr;
    unsigned int num;
    int rc = -1;

    if (virAsprintf(&bdfName, "%.4x:%.2x:%.2x.%.1x", addr->domain,
                    addr->bus, addr->slot, addr->function) < 0)
        goto cleanup;

    if (virPCIFile(&linkPath, bdfName, "iommu_group") < 0)
        goto cleanup;

    if (virFileIsLink(linkPath) != 1) {
        rc = -2;
        goto cleanup;
    }

    if (virFileResolveLink(linkPath, &linkTarget) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unable to resolve device %s iommu_group symlink %s"),
                       bdfName, linkPath);
        goto cleanup;
    }

    /* the group number is the final path component of the link target */
    numStr = last_component(linkTarget);
    if (virStrToLong_ui(numStr, NULL, 10, &num) >= 0) {
        rc = num;
    } else {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("device %s iommu_group symlink %s has "
                         "invalid group number %s"),
                       bdfName, linkTarget, numStr);
    }

 cleanup:
    VIR_FREE(bdfName);
    VIR_FREE(linkPath);
    VIR_FREE(linkTarget);
    return rc;
}


2167 2168
/* virPCIDeviceGetIOMMUGroupDev - return the name of the device used
 * to control this PCI device's group (e.g. "/dev/vfio/15")
2169 2170
 */
char *
2171
virPCIDeviceGetIOMMUGroupDev(virPCIDevicePtr dev)
2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191
{
    char *devPath = NULL;
    char *groupPath = NULL;
    char *groupDev = NULL;

    if (virPCIFile(&devPath, dev->name, "iommu_group") < 0)
        goto cleanup;
    if (virFileIsLink(devPath) != 1) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Invalid device %s iommu_group file %s is not a symlink"),
                       dev->name, devPath);
        goto cleanup;
    }
    if (virFileResolveLink(devPath, &groupPath) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unable to resolve device %s iommu_group symlink %s"),
                       dev->name, devPath);
        goto cleanup;
    }
    if (virAsprintf(&groupDev, "/dev/vfio/%s",
2192
                    last_component(groupPath)) < 0)
2193
        goto cleanup;
2194
 cleanup:
2195 2196 2197 2198 2199
    VIR_FREE(devPath);
    VIR_FREE(groupPath);
    return groupDev;
}

J
Jiri Denemark 已提交
2200
static int
2201
virPCIDeviceDownstreamLacksACS(virPCIDevicePtr dev)
J
Jiri Denemark 已提交
2202 2203 2204 2205
{
    uint16_t flags;
    uint16_t ctrl;
    unsigned int pos;
2206 2207
    int fd;
    int ret = 0;
2208
    uint16_t device_class;
J
Jiri Denemark 已提交
2209

2210
    if ((fd = virPCIDeviceConfigOpen(dev, true)) < 0)
J
Jiri Denemark 已提交
2211 2212
        return -1;

2213
    if (virPCIDeviceInit(dev, fd) < 0) {
2214 2215 2216 2217
        ret = -1;
        goto cleanup;
    }

2218 2219 2220
    if (virPCIDeviceReadClass(dev, &device_class) < 0)
        goto cleanup;

J
Jiri Denemark 已提交
2221
    pos = dev->pcie_cap_pos;
2222
    if (!pos || device_class != PCI_CLASS_BRIDGE_PCI)
2223
        goto cleanup;
J
Jiri Denemark 已提交
2224

2225
    flags = virPCIDeviceRead16(dev, fd, pos + PCI_EXP_FLAGS);
J
Jiri Denemark 已提交
2226
    if (((flags & PCI_EXP_FLAGS_TYPE) >> 4) != PCI_EXP_TYPE_DOWNSTREAM)
2227
        goto cleanup;
J
Jiri Denemark 已提交
2228

2229
    pos = virPCIDeviceFindExtendedCapabilityOffset(dev, fd, PCI_EXT_CAP_ID_ACS);
J
Jiri Denemark 已提交
2230 2231
    if (!pos) {
        VIR_DEBUG("%s %s: downstream port lacks ACS", dev->id, dev->name);
2232 2233
        ret = 1;
        goto cleanup;
J
Jiri Denemark 已提交
2234 2235
    }

2236
    ctrl = virPCIDeviceRead16(dev, fd, pos + PCI_EXT_ACS_CTRL);
J
Jiri Denemark 已提交
2237 2238 2239
    if ((ctrl & PCI_EXT_CAP_ACS_ENABLED) != PCI_EXT_CAP_ACS_ENABLED) {
        VIR_DEBUG("%s %s: downstream port has ACS disabled",
                  dev->id, dev->name);
2240 2241
        ret = 1;
        goto cleanup;
J
Jiri Denemark 已提交
2242 2243
    }

2244
 cleanup:
2245
    virPCIDeviceConfigClose(dev, fd);
2246
    return ret;
J
Jiri Denemark 已提交
2247 2248 2249
}

static int
2250
virPCIDeviceIsBehindSwitchLackingACS(virPCIDevicePtr dev)
J
Jiri Denemark 已提交
2251
{
2252
    virPCIDevicePtr parent;
J
Jiri Denemark 已提交
2253

2254
    if (virPCIDeviceGetParent(dev, &parent) < 0)
2255
        return -1;
2256 2257 2258 2259 2260 2261 2262 2263
    if (!parent) {
        /* if we have no parent, and this is the root bus, ACS doesn't come
         * into play since devices on the root bus can't P2P without going
         * through the root IOMMU.
         */
        if (dev->bus == 0)
            return 0;
        else {
2264
            virReportError(VIR_ERR_INTERNAL_ERROR,
2265 2266 2267 2268
                           _("Failed to find parent device for %s"),
                           dev->name);
            return -1;
        }
J
Jiri Denemark 已提交
2269 2270 2271 2272 2273 2274 2275
    }

    /* XXX we should rather fail when we can't find device's parent and
     * stop the loop when we get to root instead of just stopping when no
     * parent can be found
     */
    do {
2276
        virPCIDevicePtr tmp;
J
Jiri Denemark 已提交
2277
        int acs;
2278
        int ret;
J
Jiri Denemark 已提交
2279

2280
        acs = virPCIDeviceDownstreamLacksACS(parent);
J
Jiri Denemark 已提交
2281 2282

        if (acs) {
2283
            virPCIDeviceFree(parent);
J
Jiri Denemark 已提交
2284 2285 2286 2287 2288 2289 2290
            if (acs < 0)
                return -1;
            else
                return 1;
        }

        tmp = parent;
2291 2292
        ret = virPCIDeviceGetParent(parent, &parent);
        virPCIDeviceFree(tmp);
2293 2294
        if (ret < 0)
            return -1;
J
Jiri Denemark 已提交
2295 2296 2297 2298 2299
    } while (parent);

    return 0;
}

2300 2301
/*
 * virPCIDeviceIsAssignable:
 * @dev: device to check
 * @strict_acs_check: when non-zero, a switch lacking ACS above @dev
 *                    makes the device unassignable
 *
 * Returns 1 if @dev may be assigned, 0 if not (which includes the
 * case where the ACS check itself failed).
 */
int virPCIDeviceIsAssignable(virPCIDevicePtr dev,
                             int strict_acs_check)
{
    int lacking;

    /* XXX This could be a great place to actually check that a non-managed
     * device isn't in use, e.g. by checking that device is either un-bound
     * or bound to a stub driver.
     */

    lacking = virPCIDeviceIsBehindSwitchLackingACS(dev);
    if (lacking < 0)
        return 0;

    if (!lacking)
        return 1;

    if (strict_acs_check) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Device %s is behind a switch lacking ACS and "
                         "cannot be assigned"),
                       dev->name);
        return 0;
    }

    VIR_DEBUG("%s %s: strict ACS check disabled; device assignment allowed",
              dev->id, dev->name);
    return 1;
}
2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347

/*
 * logStrToLong_ui:
 * Wrapper around virStrToLong_ui() that logs the outcome: a debug
 * message with the converted value when it returns 0, an error
 * message otherwise. The return value of virStrToLong_ui() is passed
 * back unchanged.
 */
static int
logStrToLong_ui(char const *s,
                char **end_ptr,
                int base,
                unsigned int *result)
{
    int rc = virStrToLong_ui(s, end_ptr, base, result);

    if (rc == 0) {
        VIR_DEBUG("Converted '%s' to unsigned int %u", s, *result);
    } else {
        VIR_ERROR(_("Failed to convert '%s' to unsigned int"), s);
    }

    return rc;
}

2348 2349
int
virPCIDeviceAddressParse(char *address,
2350
                         virPCIDeviceAddressPtr bdf)
2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376
{
    char *p = NULL;
    int ret = -1;

    if ((address == NULL) || (logStrToLong_ui(address, &p, 16,
                                              &bdf->domain) == -1)) {
        goto out;
    }

    if ((p == NULL) || (logStrToLong_ui(p+1, &p, 16,
                                        &bdf->bus) == -1)) {
        goto out;
    }

    if ((p == NULL) || (logStrToLong_ui(p+1, &p, 16,
                                        &bdf->slot) == -1)) {
        goto out;
    }

    if ((p == NULL) || (logStrToLong_ui(p+1, &p, 16,
                                        &bdf->function) == -1)) {
        goto out;
    }

    ret = 0;

2377
 out:
2378 2379 2380
    return ret;
}

2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395
#ifdef __linux__

/*
 * Returns true when @bdf1 and @bdf2 have the same domain, bus, slot
 * and function, false otherwise.
 */
static bool
virPCIDeviceAddressIsEqual(virPCIDeviceAddressPtr bdf1,
                           virPCIDeviceAddressPtr bdf2)
{
    if (bdf1->domain != bdf2->domain)
        return false;
    if (bdf1->bus != bdf2->bus)
        return false;
    if (bdf1->slot != bdf2->slot)
        return false;
    if (bdf1->function != bdf2->function)
        return false;

    return true;
}

2396
static int
2397 2398
virPCIGetDeviceAddressFromSysfsLink(const char *device_link,
                                    virPCIDeviceAddressPtr *bdf)
2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412
{
    char *config_address = NULL;
    char *device_path = NULL;
    char errbuf[64];
    int ret = -1;

    VIR_DEBUG("Attempting to resolve device path from device link '%s'",
              device_link);

    if (!virFileExists(device_link)) {
        VIR_DEBUG("sysfs_path '%s' does not exist", device_link);
        return ret;
    }

2413
    device_path = canonicalize_file_name(device_link);
2414 2415
    if (device_path == NULL) {
        memset(errbuf, '\0', sizeof(errbuf));
2416 2417 2418
        virReportSystemError(errno,
                             _("Failed to resolve device link '%s'"),
                             device_link);
2419 2420 2421
        return ret;
    }

2422
    config_address = last_component(device_path);
2423
    if (VIR_ALLOC(*bdf) != 0)
2424 2425
        goto out;

2426
    if (virPCIDeviceAddressParse(config_address, *bdf) != 0) {
2427
        virReportError(VIR_ERR_INTERNAL_ERROR,
2428 2429 2430 2431 2432 2433
                       _("Failed to parse PCI config address '%s'"),
                       config_address);
        VIR_FREE(*bdf);
        goto out;
    }

2434
    VIR_DEBUG("virPCIDeviceAddress %.4x:%.2x:%.2x.%.1x",
2435 2436 2437 2438 2439 2440 2441
              (*bdf)->domain,
              (*bdf)->bus,
              (*bdf)->slot,
              (*bdf)->function);

    ret = 0;

2442
 out:
2443 2444 2445 2446 2447 2448 2449 2450 2451
    VIR_FREE(device_path);

    return ret;
}

/*
 * Returns Physical function given a virtual function
 */
int
2452 2453
virPCIGetPhysicalFunction(const char *vf_sysfs_path,
                          virPCIDeviceAddressPtr *physical_function)
2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464
{
    int ret = -1;
    char *device_link = NULL;

    VIR_DEBUG("Attempting to get SR IOV physical function for device "
              "with sysfs path '%s'", vf_sysfs_path);

    if (virBuildPath(&device_link, vf_sysfs_path, "physfn") == -1) {
        virReportOOMError();
        return ret;
    } else {
2465 2466
        ret = virPCIGetDeviceAddressFromSysfsLink(device_link,
                                                  physical_function);
2467 2468 2469 2470 2471 2472 2473
    }

    VIR_FREE(device_link);

    return ret;
}

2474

2475 2476 2477 2478
/*
 * Returns virtual functions of a physical function
 */
int
2479 2480
virPCIGetVirtualFunctions(const char *sysfs_path,
                          virPCIDeviceAddressPtr **virtual_functions,
2481
                          size_t *num_virtual_functions)
2482 2483
{
    int ret = -1;
2484
    size_t i;
2485
    char *device_link = NULL;
2486
    virPCIDeviceAddress *config_addr = NULL;
2487 2488 2489 2490

    VIR_DEBUG("Attempting to get SR IOV virtual functions for device"
              "with sysfs path '%s'", sysfs_path);

2491 2492 2493
    *virtual_functions = NULL;
    *num_virtual_functions = 0;

2494 2495 2496 2497
    do {
        /* look for virtfn%d links until one isn't found */
        if (virAsprintf(&device_link, "%s/virtfn%zu", sysfs_path, *num_virtual_functions) < 0)
            goto error;
2498

2499 2500
        if (!virFileExists(device_link))
            break;
2501

2502 2503 2504 2505 2506 2507
        if (virPCIGetDeviceAddressFromSysfsLink(device_link, &config_addr) < 0) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Failed to get SRIOV function from device link '%s'"),
                           device_link);
            goto error;
        }
2508

2509 2510 2511 2512
        VIR_DEBUG("Found virtual function %zu", *num_virtual_functions);
        if (VIR_APPEND_ELEMENT(*virtual_functions, *num_virtual_functions, config_addr) < 0)
            goto error;
        VIR_FREE(device_link);
2513

2514
    } while (1);
2515 2516

    ret = 0;
2517
 cleanup:
2518
    VIR_FREE(device_link);
2519
    VIR_FREE(config_addr);
2520
    return ret;
2521

2522
 error:
2523 2524 2525
    for (i = 0; i < *num_virtual_functions; i++)
        VIR_FREE((*virtual_functions)[i]);
    VIR_FREE(*virtual_functions);
2526
    goto cleanup;
2527
}
2528

2529

2530 2531 2532 2533
/*
 * virPCIIsVirtualFunction:
 * Tests whether "<vf_sysfs_device_link>/physfn" exists.
 *
 * Returns 1 if it does (the device is a virtual function), 0 if not,
 * -1 if the path could not be built.
 */
int
virPCIIsVirtualFunction(const char *vf_sysfs_device_link)
{
    char *physfnLink = NULL;
    int found;

    if (virAsprintf(&physfnLink, "%s/physfn",
                    vf_sysfs_device_link) < 0)
        return -1;

    found = virFileExists(physfnLink);
    VIR_FREE(physfnLink);

    return found;
}

/*
 * Returns the sriov virtual function index of vf given its pf
 */
int
2554 2555 2556
virPCIGetVirtualFunctionIndex(const char *pf_sysfs_device_link,
                              const char *vf_sysfs_device_link,
                              int *vf_index)
2557
{
2558 2559
    int ret = -1;
    size_t i;
2560
    size_t num_virt_fns = 0;
2561 2562
    virPCIDeviceAddressPtr vf_bdf = NULL;
    virPCIDeviceAddressPtr *virt_fns = NULL;
2563

2564 2565
    if (virPCIGetDeviceAddressFromSysfsLink(vf_sysfs_device_link,
                                            &vf_bdf) < 0)
2566 2567
        return ret;

2568 2569
    if (virPCIGetVirtualFunctions(pf_sysfs_device_link, &virt_fns,
                                  &num_virt_fns) < 0) {
2570
        virReportError(VIR_ERR_INTERNAL_ERROR,
2571
                       _("Error getting physical function's '%s' "
2572
                         "virtual_functions"), pf_sysfs_device_link);
2573 2574 2575 2576
        goto out;
    }

    for (i = 0; i < num_virt_fns; i++) {
2577 2578 2579 2580 2581
        if (virPCIDeviceAddressIsEqual(vf_bdf, virt_fns[i])) {
            *vf_index = i;
            ret = 0;
            break;
        }
2582 2583
    }

2584
 out:
2585 2586 2587

    /* free virtual functions */
    for (i = 0; i < num_virt_fns; i++)
2588
        VIR_FREE(virt_fns[i]);
2589

A
ajia@redhat.com 已提交
2590
    VIR_FREE(virt_fns);
2591 2592 2593 2594 2595
    VIR_FREE(vf_bdf);

    return ret;
}

2596 2597 2598 2599 2600
/*
 * Returns a path to the PCI sysfs file given the BDF of the PCI function
 */

int
2601
virPCIGetSysfsFile(char *virPCIDeviceName, char **pci_sysfs_device_link)
2602
{
2603 2604 2605 2606
    if (virAsprintf(pci_sysfs_device_link, PCI_SYSFS "devices/%s",
                    virPCIDeviceName) < 0)
        return -1;
    return 0;
2607 2608
}

R
Roopa Prabhu 已提交
2609
int
2610 2611
virPCIDeviceAddressGetSysfsFile(virPCIDeviceAddressPtr dev,
                                char **pci_sysfs_device_link)
R
Roopa Prabhu 已提交
2612
{
2613 2614 2615 2616 2617
    if (virAsprintf(pci_sysfs_device_link,
                    PCI_SYSFS "devices/%04x:%02x:%02x.%x", dev->domain,
                    dev->bus, dev->slot, dev->function) < 0)
        return -1;
    return 0;
R
Roopa Prabhu 已提交
2618 2619
}

2620 2621 2622 2623
/*
 * Returns the network device name of a pci device
 */
int
2624 2625 2626 2627 2628 2629
virPCIGetNetName(char *device_link_sysfs_path, char **netname)
{
    char *pcidev_sysfs_net_path = NULL;
    int ret = -1;
    DIR *dir = NULL;
    struct dirent *entry = NULL;
2630

2631 2632 2633 2634 2635 2636 2637 2638 2639
    if (virBuildPath(&pcidev_sysfs_net_path, device_link_sysfs_path,
                     "net") == -1) {
        virReportOOMError();
        return -1;
    }

    dir = opendir(pcidev_sysfs_net_path);
    if (dir == NULL)
        goto out;
2640

2641 2642 2643 2644 2645 2646
    while ((entry = readdir(dir))) {
        if (STREQ(entry->d_name, ".") ||
            STREQ(entry->d_name, ".."))
            continue;

        /* Assume a single directory entry */
2647
        if (VIR_STRDUP(*netname, entry->d_name) > 0)
2648 2649 2650 2651 2652
            ret = 0;
        break;
    }

    closedir(dir);
2653

2654
 out:
2655
    VIR_FREE(pcidev_sysfs_net_path);
2656

2657
    return ret;
2658
}
R
Roopa Prabhu 已提交
2659 2660

int
2661 2662
virPCIGetVirtualFunctionInfo(const char *vf_sysfs_device_path,
                             char **pfname, int *vf_index)
R
Roopa Prabhu 已提交
2663
{
2664
    virPCIDeviceAddressPtr pf_config_address = NULL;
R
Roopa Prabhu 已提交
2665 2666 2667
    char *pf_sysfs_device_path = NULL;
    int ret = -1;

2668
    if (virPCIGetPhysicalFunction(vf_sysfs_device_path, &pf_config_address) < 0)
R
Roopa Prabhu 已提交
2669 2670
        return ret;

2671 2672
    if (virPCIDeviceAddressGetSysfsFile(pf_config_address,
                                        &pf_sysfs_device_path) < 0) {
R
Roopa Prabhu 已提交
2673 2674 2675 2676 2677

        VIR_FREE(pf_config_address);
        return ret;
    }

2678 2679
    if (virPCIGetVirtualFunctionIndex(pf_sysfs_device_path, vf_sysfs_device_path,
                                      vf_index) < 0)
R
Roopa Prabhu 已提交
2680 2681
        goto cleanup;

2682
    ret = virPCIGetNetName(pf_sysfs_device_path, pfname);
R
Roopa Prabhu 已提交
2683

2684
 cleanup:
R
Roopa Prabhu 已提交
2685 2686 2687 2688 2689 2690
    VIR_FREE(pf_config_address);
    VIR_FREE(pf_sysfs_device_path);

    return ret;
}

2691
#else
2692 2693
/* Error text shared by the non-Linux stubs in this #else branch; it
 * is wrapped in N_() here and passed through _() at each use. */
static const char *unsupported = N_("not supported on non-linux platforms");

2694
int
2695 2696
virPCIGetPhysicalFunction(const char *vf_sysfs_path ATTRIBUTE_UNUSED,
                          virPCIDeviceAddressPtr *physical_function ATTRIBUTE_UNUSED)
2697
{
2698
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2699 2700 2701 2702
    return -1;
}

int
2703 2704
virPCIGetVirtualFunctions(const char *sysfs_path ATTRIBUTE_UNUSED,
                          virPCIDeviceAddressPtr **virtual_functions ATTRIBUTE_UNUSED,
2705
                          size_t *num_virtual_functions ATTRIBUTE_UNUSED)
2706
{
2707
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2708 2709
    return -1;
}
2710 2711

int
E
Eric Blake 已提交
2712
virPCIIsVirtualFunction(const char *vf_sysfs_device_link ATTRIBUTE_UNUSED)
2713
{
2714
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2715 2716 2717 2718
    return -1;
}

int
2719 2720 2721
virPCIGetVirtualFunctionIndex(const char *pf_sysfs_device_link ATTRIBUTE_UNUSED,
                              const char *vf_sysfs_device_link ATTRIBUTE_UNUSED,
                              int *vf_index ATTRIBUTE_UNUSED)
2722
{
2723
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2724 2725 2726 2727
    return -1;

}

2728
int
2729 2730
virPCIDeviceAddressGetSysfsFile(virPCIDeviceAddressPtr dev ATTRIBUTE_UNUSED,
                                char **pci_sysfs_device_link ATTRIBUTE_UNUSED)
2731
{
2732
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2733 2734 2735
    return -1;
}

2736
int
2737
virPCIGetNetName(char *device_link_sysfs_path ATTRIBUTE_UNUSED,
2738
                 char **netname ATTRIBUTE_UNUSED)
2739
{
2740
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2741 2742
    return -1;
}
R
Roopa Prabhu 已提交
2743 2744

int
2745 2746 2747
virPCIGetVirtualFunctionInfo(const char *vf_sysfs_device_path ATTRIBUTE_UNUSED,
                             char **pfname ATTRIBUTE_UNUSED,
                             int *vf_index ATTRIBUTE_UNUSED)
R
Roopa Prabhu 已提交
2748
{
2749
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
R
Roopa Prabhu 已提交
2750 2751
    return -1;
}
2752
#endif /* __linux__ */