/*
 * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
 * Copyright IBM Corp. 2004 2005
 * Copyright Linas Vepstas <linas@linas.org> 2004, 2005
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
 */
#include <linux/delay.h>
#include <linux/interrupt.h>
27
#include <linux/irq.h>
28
#include <linux/module.h>
29 30 31 32 33 34 35 36
#include <linux/pci.h>
#include <asm/eeh.h>
#include <asm/eeh_event.h>
#include <asm/ppc-pci.h>
#include <asm/pci-bridge.h>
#include <asm/prom.h>
#include <asm/rtas.h>

37 38 39 40 41 42 43
/**
 * eeh_pcid_name - Retrieve name of PCI device driver
 * @pdev: PCI device
 *
 * This routine is used to retrieve the name of PCI device driver
 * if that's valid.
 */
44
static inline const char *eeh_pcid_name(struct pci_dev *pdev)
45
{
46
	if (pdev && pdev->dev.driver)
47 48 49 50
		return pdev->dev.driver->name;
	return "";
}

51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
/**
 * eeh_pcid_get - Get the PCI device driver
 * @pdev: PCI device
 *
 * The function is used to retrieve the PCI device driver for
 * the indicated PCI device. Besides, we will increase the reference
 * of the PCI device driver to prevent that being unloaded on
 * the fly. Otherwise, kernel crash would be seen.
 */
static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
{
	if (!pdev || !pdev->driver)
		return NULL;

	if (!try_module_get(pdev->driver->driver.owner))
		return NULL;

	return pdev->driver;
}

/**
 * eeh_pcid_put - Drop the reference on the PCI device driver
 * @pdev: PCI device
 *
 * Releases the module reference of the driver currently bound to
 * @pdev. Does nothing when @pdev is NULL or has no driver bound.
 */
static inline void eeh_pcid_put(struct pci_dev *pdev)
{
	if (pdev && pdev->driver)
		module_put(pdev->driver->driver.owner);
}

86
#if 0
/*
 * Debug helper, currently compiled out.
 *
 * Prints one line describing @pdn, preceded by @dent spaces, then recurses
 * into the PCI_DN() of every child of @pdn's device node with the
 * indentation increased by three columns. A NULL @pdn is ignored.
 */
static void print_device_node_tree(struct pci_dn *pdn, int dent)
{
	int i;
	struct device_node *pc;

	if (!pdn)
		return;
	for (i = 0; i < dent; i++)
		printk(" ");
	printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n",
		pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr,
		pdn->eeh_pe_config_addr, pdn->node->full_name);
	dent += 3;
	pc = pdn->node->child;
	while (pc) {
		print_device_node_tree(PCI_DN(pc), dent);
		pc = pc->sibling;
	}
}
#endif

108
/**
109 110 111 112 113 114 115 116
 * eeh_disable_irq - Disable interrupt for the recovering device
 * @dev: PCI device
 *
 * This routine must be called when reporting temporary or permanent
 * error to the particular PCI device to disable interrupt of that
 * device. If the device has enabled MSI or MSI-X interrupt, we needn't
 * do real work because EEH should freeze DMA transfers for those PCI
 * devices encountering EEH errors, which includes MSI or MSI-X.
117 118 119
 */
static void eeh_disable_irq(struct pci_dev *dev)
{
120
	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
121 122 123 124

	/* Don't disable MSI and MSI-X interrupts. They are
	 * effectively disabled by the DMA Stopped state
	 * when an EEH error occurs.
125
	 */
126 127 128
	if (dev->msi_enabled || dev->msix_enabled)
		return;

129
	if (!irq_has_action(dev->irq))
130 131
		return;

G
Gavin Shan 已提交
132
	edev->mode |= EEH_DEV_IRQ_DISABLED;
133 134 135 136
	disable_irq_nosync(dev->irq);
}

/**
137 138 139 140 141
 * eeh_enable_irq - Enable interrupt for the recovering device
 * @dev: PCI device
 *
 * This routine must be called to enable interrupt while failed
 * device could be resumed.
142 143 144
 */
static void eeh_enable_irq(struct pci_dev *dev)
{
145
	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
146
	struct irq_desc *desc;
147

G
Gavin Shan 已提交
148 149
	if ((edev->mode) & EEH_DEV_IRQ_DISABLED) {
		edev->mode &= ~EEH_DEV_IRQ_DISABLED;
150 151 152 153

		desc = irq_to_desc(dev->irq);
		if (desc && desc->depth > 0)
			enable_irq(dev->irq);
154 155 156
	}
}

157
/**
158
 * eeh_report_error - Report pci error to each device driver
159
 * @data: eeh device
160
 * @userdata: return value
G
Gavin Shan 已提交
161 162 163
 *
 * Report an EEH error to each device driver, collect up and
 * merge the device driver responses. Cumulative response
164
 * passed back in "userdata".
165
 */
166
static void *eeh_report_error(void *data, void *userdata)
167
{
168 169
	struct eeh_dev *edev = (struct eeh_dev *)data;
	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
170
	enum pci_ers_result rc, *res = userdata;
171
	struct pci_driver *driver;
172

173 174 175 176
	/* We might not have the associated PCI device,
	 * then we should continue for next one.
	 */
	if (!dev) return NULL;
177 178
	dev->error_state = pci_channel_io_frozen;

179 180
	driver = eeh_pcid_get(dev);
	if (!driver) return NULL;
181

182 183
	eeh_disable_irq(dev);

184
	if (!driver->err_handler ||
185 186
	    !driver->err_handler->error_detected) {
		eeh_pcid_put(dev);
187
		return NULL;
188
	}
189

190
	rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
191 192 193

	/* A driver that needs a reset trumps all others */
	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
194
	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
195

196
	eeh_pcid_put(dev);
197
	return NULL;
198 199 200
}

/**
201
 * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled
202
 * @data: eeh device
203
 * @userdata: return value
204
 *
L
Linas Vepstas 已提交
205 206 207
 * Tells each device driver that IO ports, MMIO and config space I/O
 * are now enabled. Collects up and merges the device driver responses.
 * Cumulative response passed back in "userdata".
208
 */
209
static void *eeh_report_mmio_enabled(void *data, void *userdata)
210
{
211 212
	struct eeh_dev *edev = (struct eeh_dev *)data;
	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
213
	enum pci_ers_result rc, *res = userdata;
214
	struct pci_driver *driver;
215

216 217
	driver = eeh_pcid_get(dev);
	if (!driver) return NULL;
218

219 220 221
	if (!driver->err_handler ||
	    !driver->err_handler->mmio_enabled) {
		eeh_pcid_put(dev);
222
		return NULL;
223
	}
224

225
	rc = driver->err_handler->mmio_enabled(dev);
226 227 228

	/* A driver that needs a reset trumps all others */
	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
229
	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
230

231
	eeh_pcid_put(dev);
232
	return NULL;
233 234
}

235
/**
236
 * eeh_report_reset - Tell device that slot has been reset
237
 * @data: eeh device
238 239 240 241 242 243
 * @userdata: return value
 *
 * This routine must be called while EEH tries to reset particular
 * PCI device so that the associated PCI device driver could take
 * some actions, usually to save data the driver needs so that the
 * driver can work again while the device is recovered.
244
 */
245
static void *eeh_report_reset(void *data, void *userdata)
246
{
247 248
	struct eeh_dev *edev = (struct eeh_dev *)data;
	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
249
	enum pci_ers_result rc, *res = userdata;
250
	struct pci_driver *driver;
251

252
	if (!dev) return NULL;
253 254
	dev->error_state = pci_channel_io_normal;

255 256 257
	driver = eeh_pcid_get(dev);
	if (!driver) return NULL;

258 259
	eeh_enable_irq(dev);

260
	if (!driver->err_handler ||
261 262
	    !driver->err_handler->slot_reset) {
		eeh_pcid_put(dev);
263
		return NULL;
264
	}
265

266
	rc = driver->err_handler->slot_reset(dev);
267 268
	if ((*res == PCI_ERS_RESULT_NONE) ||
	    (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc;
269 270
	if (*res == PCI_ERS_RESULT_DISCONNECT &&
	     rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
271

272
	eeh_pcid_put(dev);
273
	return NULL;
274 275
}

276
/**
277
 * eeh_report_resume - Tell device to resume normal operations
278
 * @data: eeh device
279 280 281 282 283
 * @userdata: return value
 *
 * This routine must be called to notify the device driver that it
 * could resume so that the device driver can do some initialization
 * to make the recovered device work again.
284
 */
285
static void *eeh_report_resume(void *data, void *userdata)
286
{
287 288 289 290 291
	struct eeh_dev *edev = (struct eeh_dev *)data;
	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
	struct pci_driver *driver;

	if (!dev) return NULL;
292 293
	dev->error_state = pci_channel_io_normal;

294 295
	driver = eeh_pcid_get(dev);
	if (!driver) return NULL;
L
Linas Vepstas 已提交
296

297 298
	eeh_enable_irq(dev);

L
Linas Vepstas 已提交
299
	if (!driver->err_handler ||
300 301
	    !driver->err_handler->resume) {
		eeh_pcid_put(dev);
302
		return NULL;
303
	}
304 305

	driver->err_handler->resume(dev);
306

307
	eeh_pcid_put(dev);
308
	return NULL;
309 310
}

311
/**
312
 * eeh_report_failure - Tell device driver that device is dead.
313
 * @data: eeh device
314
 * @userdata: return value
315 316 317 318
 *
 * This informs the device driver that the device is permanently
 * dead, and that no further recovery attempts will be made on it.
 */
319
static void *eeh_report_failure(void *data, void *userdata)
320
{
321 322 323 324 325
	struct eeh_dev *edev = (struct eeh_dev *)data;
	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
	struct pci_driver *driver;

	if (!dev) return NULL;
326 327
	dev->error_state = pci_channel_io_perm_failure;

328 329
	driver = eeh_pcid_get(dev);
	if (!driver) return NULL;
330

331 332 333
	eeh_disable_irq(dev);

	if (!driver->err_handler ||
334 335
	    !driver->err_handler->error_detected) {
		eeh_pcid_put(dev);
336
		return NULL;
337
	}
338

339
	driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
340

341
	eeh_pcid_put(dev);
342
	return NULL;
343 344
}

345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392
/**
 * eeh_rmv_device - Remove a PCI device that has no EEH-aware driver
 * @data: eeh device
 * @userdata: points to an int counting the devices removed so far
 *
 * Traversal callback used for partial hotplug. Bridges and devices
 * whose driver provides an err_handler are left alone. Any other
 * device is flagged EEH_DEV_DISCONNECTED, counted in @userdata, and
 * removed from the PCI subsystem. Always returns NULL.
 */
static void *eeh_rmv_device(void *data, void *userdata)
{
	struct pci_driver *driver;
	struct eeh_dev *edev = (struct eeh_dev *)data;
	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
	int *removed = (int *)userdata;

	/*
	 * Actually, we should remove the PCI bridges as well.
	 * However, that's lots of complexity to do that,
	 * particularly some of devices under the bridge might
	 * support EEH. So we just care about PCI devices for
	 * simplicity here.
	 */
	if (!dev || (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE))
		return NULL;

	/*
	 * eeh_pcid_get() pins the driver's module. The reference is only
	 * needed to look at err_handler, so drop it again on every path;
	 * otherwise the module refcount leaks for each device visited.
	 */
	driver = eeh_pcid_get(dev);
	if (driver) {
		int eeh_aware = driver->err_handler != NULL;

		eeh_pcid_put(dev);
		if (eeh_aware)
			return NULL;
	}

	/* Remove it from PCI subsystem */
	pr_debug("EEH: Removing %s without EEH sensitive driver\n",
		 pci_name(dev));
	/* Save the bus before the PCI device goes away */
	edev->bus = dev->bus;
	edev->mode |= EEH_DEV_DISCONNECTED;
	(*removed)++;

	pci_stop_and_remove_bus_device(dev);

	return NULL;
}

/**
 * eeh_pe_detach_dev - Detach disconnected EEH devices from a PE
 * @data: EEH PE
 * @userdata: unused
 *
 * Traversal callback: for every EEH device of the PE that is flagged
 * EEH_DEV_DISCONNECTED, clear that flag together with
 * EEH_DEV_IRQ_DISABLED and remove the device from its parent PE.
 * Always returns NULL.
 */
static void *eeh_pe_detach_dev(void *data, void *userdata)
{
	struct eeh_pe *pe = (struct eeh_pe *)data;
	struct eeh_dev *cur, *next;

	eeh_pe_for_each_dev(pe, cur, next) {
		if (cur->mode & EEH_DEV_DISCONNECTED) {
			cur->mode &= ~(EEH_DEV_DISCONNECTED |
				       EEH_DEV_IRQ_DISABLED);
			eeh_rmv_from_parent_pe(cur);
		}
	}

	return NULL;
}

393
/**
394
 * eeh_reset_device - Perform actual reset of a pci slot
395
 * @pe: EEH PE
396
 * @bus: PCI bus corresponding to the isolcated slot
397
 *
398 399 400
 * This routine must be called to do reset on the indicated PE.
 * During the reset, udev might be invoked because those affected
 * PCI devices will be removed and then added.
401
 */
402
static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
403
{
404
	struct pci_bus *frozen_bus = eeh_pe_bus_get(pe);
405
	struct timeval tstamp;
406
	int cnt, rc, removed = 0;
407 408

	/* pcibios will clear the counter; save the value */
409
	cnt = pe->freeze_count;
410
	tstamp = pe->tstamp;
411

412 413 414 415 416 417
	/*
	 * We don't remove the corresponding PE instances because
	 * we need the information afterwords. The attached EEH
	 * devices are expected to be attached soon when calling
	 * into pcibios_add_pci_devices().
	 */
418 419
	eeh_pe_state_mark(pe, EEH_PE_KEEP);
	if (bus)
G
Gavin Shan 已提交
420
		pcibios_remove_pci_devices(bus);
421 422
	else if (frozen_bus)
		eeh_pe_dev_traverse(pe, eeh_rmv_device, &removed);
423 424

	/* Reset the pci controller. (Asserts RST#; resets config space).
425
	 * Reconfigure bridges and devices. Don't try to bring the system
426 427
	 * up if the reset failed for some reason.
	 */
428
	rc = eeh_reset_pe(pe);
429 430
	if (rc)
		return rc;
431

432 433 434
	/* Restore PE */
	eeh_ops->configure_bridge(pe);
	eeh_pe_restore_bars(pe);
435 436

	/* Give the system 5 seconds to finish running the user-space
G
Gavin Shan 已提交
437 438 439
	 * hotplug shutdown scripts, e.g. ifdown for ethernet.  Yes,
	 * this is a hack, but if we don't do this, and try to bring
	 * the device up before the scripts have taken it down,
440 441 442
	 * potentially weird things happen.
	 */
	if (bus) {
443
		pr_info("EEH: Sleep 5s ahead of complete hotplug\n");
444
		ssleep(5);
445 446 447 448 449 450 451

		/*
		 * The EEH device is still connected with its parent
		 * PE. We should disconnect it so the binding can be
		 * rebuilt when adding PCI devices.
		 */
		eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
452
		pcibios_add_pci_devices(bus);
453 454 455 456 457 458
	} else if (frozen_bus && removed) {
		pr_info("EEH: Sleep 5s ahead of partial hotplug\n");
		ssleep(5);

		eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
		pcibios_add_pci_devices(frozen_bus);
459
	}
460
	eeh_pe_state_clear(pe, EEH_PE_KEEP);
461 462

	pe->tstamp = tstamp;
463
	pe->freeze_count = cnt;
464 465

	return 0;
466 467 468 469 470
}

/* The longest amount of time to wait for a pci device
 * to come back on line, in seconds.
 */
#define MAX_WAIT_FOR_RECOVERY 150
472

473
static void eeh_handle_normal_event(struct eeh_pe *pe)
474 475
{
	struct pci_bus *frozen_bus;
476
	int rc = 0;
477
	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
478

479
	frozen_bus = eeh_pe_bus_get(pe);
480
	if (!frozen_bus) {
481 482 483
		pr_err("%s: Cannot find PCI bus for PHB#%d-PE#%x\n",
			__func__, pe->phb->global_number, pe->addr);
		return;
484 485
	}

486
	eeh_pe_update_time_stamp(pe);
487 488
	pe->freeze_count++;
	if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES)
489
		goto excess_failures;
490 491
	pr_warning("EEH: This PCI device has failed %d times in the last hour\n",
		pe->freeze_count);
492 493 494 495 496 497 498

	/* Walk the various device drivers attached to this slot through
	 * a reset sequence, giving each an opportunity to do what it needs
	 * to accomplish the reset.  Each child gets a report of the
	 * status ... if any child can't handle the reset, then the entire
	 * slot is dlpar removed and added.
	 */
499
	pr_info("EEH: Notify device drivers to shutdown\n");
500
	eeh_pe_dev_traverse(pe, eeh_report_error, &result);
501

502
	/* Get the current PCI slot state. This can take a long time,
503 504
	 * sometimes over 3 seconds for certain systems.
	 */
505
	rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
506
	if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
507
		pr_warning("EEH: Permanent failure\n");
508 509 510
		goto hard_fail;
	}

511 512
	/* Since rtas may enable MMIO when posting the error log,
	 * don't post the error log until after all dev drivers
513 514
	 * have been informed.
	 */
515
	pr_info("EEH: Collect temporary log\n");
516
	eeh_slot_error_detail(pe, EEH_LOG_TEMP);
517

518 519 520 521
	/* If all device drivers were EEH-unaware, then shut
	 * down all of the device drivers, and hope they
	 * go down willingly, without panicing the system.
	 */
522
	if (result == PCI_ERS_RESULT_NONE) {
523
		pr_info("EEH: Reset with hotplug activity\n");
524
		rc = eeh_reset_device(pe, frozen_bus);
525
		if (rc) {
526 527
			pr_warning("%s: Unable to reset, err=%d\n",
				   __func__, rc);
528
			goto hard_fail;
529
		}
530 531
	}

532 533
	/* If all devices reported they can proceed, then re-enable MMIO */
	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
534
		pr_info("EEH: Enable I/O for affected devices\n");
535
		rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
536

537 538
		if (rc < 0)
			goto hard_fail;
539 540 541
		if (rc) {
			result = PCI_ERS_RESULT_NEED_RESET;
		} else {
542
			pr_info("EEH: Notify device drivers to resume I/O\n");
543
			result = PCI_ERS_RESULT_NONE;
544
			eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result);
545
		}
546 547
	}

548
	/* If all devices reported they can proceed, then re-enable DMA */
549
	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
550
		pr_info("EEH: Enabled DMA for affected devices\n");
551
		rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
552

553 554
		if (rc < 0)
			goto hard_fail;
555 556
		if (rc)
			result = PCI_ERS_RESULT_NEED_RESET;
L
Linas Vepstas 已提交
557 558
		else
			result = PCI_ERS_RESULT_RECOVERED;
559 560 561
	}

	/* If any device has a hard failure, then shut off everything. */
562
	if (result == PCI_ERS_RESULT_DISCONNECT) {
563
		pr_warning("EEH: Device driver gave up\n");
564
		goto hard_fail;
565
	}
566 567 568

	/* If any device called out for a reset, then reset the slot */
	if (result == PCI_ERS_RESULT_NEED_RESET) {
569
		pr_info("EEH: Reset without hotplug activity\n");
570
		rc = eeh_reset_device(pe, NULL);
571
		if (rc) {
572 573
			pr_warning("%s: Cannot reset, err=%d\n",
				   __func__, rc);
574
			goto hard_fail;
575
		}
576 577 578

		pr_info("EEH: Notify device drivers "
			"the completion of reset\n");
579
		result = PCI_ERS_RESULT_NONE;
580
		eeh_pe_dev_traverse(pe, eeh_report_reset, &result);
581 582
	}

583
	/* All devices should claim they have recovered by now. */
584 585
	if ((result != PCI_ERS_RESULT_RECOVERED) &&
	    (result != PCI_ERS_RESULT_NONE)) {
586
		pr_warning("EEH: Not recovered\n");
587
		goto hard_fail;
588
	}
589

590
	/* Tell all device drivers that they can resume operations */
591
	pr_info("EEH: Notify device driver to resume\n");
592
	eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
593

594
	return;
G
Gavin Shan 已提交
595

596
excess_failures:
597 598 599 600 601
	/*
	 * About 90% of all real-life EEH failures in the field
	 * are due to poorly seated PCI cards. Only 10% or so are
	 * due to actual, failed cards.
	 */
602 603 604 605 606
	pr_err("EEH: PHB#%d-PE#%x has failed %d times in the\n"
	       "last hour and has been permanently disabled.\n"
	       "Please try reseating or replacing it.\n",
		pe->phb->global_number, pe->addr,
		pe->freeze_count);
607 608 609
	goto perm_error;

hard_fail:
610 611 612
	pr_err("EEH: Unable to recover from failure from PHB#%d-PE#%x.\n"
	       "Please try reseating or replacing it\n",
		pe->phb->global_number, pe->addr);
613

614
perm_error:
615
	eeh_slot_error_detail(pe, EEH_LOG_PERM);
616 617

	/* Notify all devices that they're about to go down. */
618
	eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
619 620

	/* Shut down the device drivers for good. */
621 622
	if (frozen_bus)
		pcibios_remove_pci_devices(frozen_bus);
623
}
624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732

/**
 * eeh_handle_special_event - Handle an error fetched via next_error()
 *
 * Asks the platform for the next pending error and acts on its class:
 * a frozen PE or fenced PHB goes through the normal recovery sequence,
 * while for a dead PHB or dead IOC the drivers under every PHB marked
 * EEH_PE_PHB_DEAD are told the device is gone and the PCI devices are
 * removed.
 */
static void eeh_handle_special_event(void)
{
	struct eeh_pe *pe = NULL, *phb_pe;
	struct pci_bus *bus;
	struct pci_controller *hose, *tmp;
	unsigned long flags;
	int rc = 0;

	/*
	 * The return value from next_error() has been classified as follows.
	 * It might be good to enumerate them. However, next_error() is only
	 * supported by PowerNV platform for now. So it would be fine to use
	 * integer directly:
	 *
	 * 4 - Dead IOC           3 - Dead PHB
	 * 2 - Fenced PHB         1 - Frozen PE
	 * 0 - No error found
	 *
	 */
	rc = eeh_ops->next_error(&pe);
	if (rc <= 0)
		return;

	switch (rc) {
	case 4:
		/* Mark all PHBs in dead state */
		eeh_serialize_lock(&flags);
		list_for_each_entry_safe(hose, tmp,
				&hose_list, list_node) {
			phb_pe = eeh_phb_pe_get(hose);
			if (!phb_pe)
				continue;

			eeh_pe_state_mark(phb_pe,
				EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
		}
		eeh_serialize_unlock(flags);

		/* Purge all events */
		eeh_remove_event(NULL);
		break;
	case 3:
	case 2:
	case 1:
		/* Mark the PE in fenced state */
		eeh_serialize_lock(&flags);
		if (rc == 3)
			eeh_pe_state_mark(pe,
				EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
		else
			eeh_pe_state_mark(pe,
				EEH_PE_ISOLATED | EEH_PE_RECOVERING);
		eeh_serialize_unlock(flags);

		/* Purge all events of the PHB */
		eeh_remove_event(pe);
		break;
	default:
		pr_err("%s: Invalid value %d from next_error()\n",
		       __func__, rc);
		return;
	}

	/*
	 * For fenced PHB and frozen PE, it's handled as normal
	 * event. We have to remove the affected PHBs for dead
	 * PHB and IOC
	 */
	if (rc == 2 || rc == 1)
		eeh_handle_normal_event(pe);
	else {
		list_for_each_entry_safe(hose, tmp,
			&hose_list, list_node) {
			phb_pe = eeh_phb_pe_get(hose);
			if (!phb_pe || !(phb_pe->state & EEH_PE_PHB_DEAD))
				continue;

			bus = eeh_pe_bus_get(phb_pe);

			/*
			 * Notify all devices that they're about to go down.
			 * The walk must start from this hose's PE: "pe" is
			 * not the PE of the hose being visited, and this
			 * function never assigns it in the dead IOC case.
			 */
			eeh_pe_dev_traverse(phb_pe, eeh_report_failure, NULL);

			/* Same guard as the normal event path */
			if (bus)
				pcibios_remove_pci_devices(bus);
			else
				pr_err("%s: Cannot find PCI bus for PHB#%d-PE#%x\n",
				       __func__, phb_pe->phb->global_number,
				       phb_pe->addr);
		}
	}
}

/**
 * eeh_handle_event - Reset a PCI device after hard lockup.
 * @pe: EEH PE, or NULL when the error has to be fetched from the
 *      platform
 *
 * Entry point of EEH recovery. A PE gets frozen when the PHB detects
 * address or data parity errors on a PCI slot. DMA to wild addresses
 * (usually caused by bugs in device drivers or PCI adapter firmware),
 * #SERR, #PERR and other PCI-related errors can trigger EEH errors
 * as well.
 *
 * Recovery unplugs the device driver (generating hotplug events to
 * userspace), issues a PCI #RST to the device, reconfigures the PCI
 * config space of all bridges and devices under the slot and finally
 * restarts the device drivers (generating a second set of hotplug
 * events).
 *
 * With a non-NULL @pe the normal event handler runs on that PE;
 * otherwise the special event handler is invoked.
 */
void eeh_handle_event(struct eeh_pe *pe)
{
	if (!pe) {
		eeh_handle_special_event();
		return;
	}

	eeh_handle_normal_event(pe);
}