eeh-basic.sh 2.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0-only

# Pull in the EEH helper functions; eeh_supported (used just below) is
# not defined in this file.
. ./eeh-functions.sh

# Nothing to test on a platform without EEH. This is reported as a
# clean exit rather than a failure.
if ! eeh_supported ; then
	echo "EEH not supported on this system, skipping"
	exit 0;
fi

# Check each debugfs test file on its own so that a single missing file
# is caught as well, not only the case where both are absent.
# NOTE(review): assumes the rest of the test needs both files -- confirm
# against eeh-functions.sh.
for eeh_file in \
	"/sys/kernel/debug/powerpc/eeh_dev_check" \
	"/sys/kernel/debug/powerpc/eeh_dev_break" ; do
	if [ ! -e "$eeh_file" ] ; then
		echo "debugfs EEH testing files are missing. Is debugfs mounted?"
		exit 1;
	fi
done

# Snapshot the PCI device list before breaking anything so it can be
# diffed against the device list once the test run has finished.
pre_lspci="$(mktemp)" || {
	echo "Failed to create a temporary file for the lspci snapshot"
	exit 1
}

# Remove the snapshot on any exit path, not just the normal one.
trap 'rm -f "$pre_lspci"' EXIT

lspci > "$pre_lspci"

# Bump the max freeze count to something absurd so we don't
# trip over it while breaking things.
echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes

# Record the devices that we break in here. Assuming everything
# goes to plan we should get them back once the recovery process
# is finished.
devices=""

# Build up a list of candidate devices. Only sysfs entries whose name
# ends in ".0" are considered.
for dev_path in /sys/bus/pci/devices/*.0 ; do
	# An unmatched glob is left as the literal pattern; skip it.
	[ -e "$dev_path" ] || continue

	# The rest of the script works with the bare device name.
	dev="${dev_path##*/}"

	# skip bridges since we can't recover them (yet...)
	if [ -e "$dev_path/pci_bus" ] ; then
		echo "$dev, Skipped: bridge"
		continue
	fi

	# Skip VFs for now since we don't have a reliable way
	# to break them.
	if [ -e "$dev_path/physfn" ] ; then
		echo "$dev, Skipped: virtfn"
		continue
	fi

	# Only resolve the driver link when it exists, so realpath and
	# basename are never run on a missing path.
	driver=""
	if [ -e "$dev_path/driver" ] ; then
		driver="$(basename "$(realpath "$dev_path/driver")")"
	fi

	if [ "ahci" = "$driver" ] ; then
		echo "$dev, Skipped: ahci doesn't support recovery"
		continue
	fi

	# Don't inject errors into an already-frozen PE. This happens with
	# PEs that contain multiple PCI devices (e.g. multi-function cards)
	# and injecting new errors during the recovery process will probably
	# result in the recovery failing and the device being marked as
	# failed.
	if ! pe_ok "$dev" ; then
		echo "$dev, Skipped: Bad initial PE state"
		continue
	fi

	echo "$dev, Added"

	# Add to the list of devices to check
	devices="$devices $dev"
done

# $devices is a space-separated list, so it is deliberately left
# unquoted wherever it needs to be split into words.
dev_count="$(echo $devices | wc -w)"
echo "Found ${dev_count} breakable devices..."

# Break each candidate in turn and count the ones that do not recover.
failed=0
for dev in $devices ; do
	echo "Breaking $dev..."

	# Re-check the PE state right before breaking the device; a bad
	# state at this point is counted as a failure.
	if ! pe_ok "$dev" ; then
		echo "Skipping $dev, Initial PE state is not ok"
		failed="$((failed + 1))"
		continue
	fi

	if ! eeh_one_dev "$dev" ; then
		failed="$((failed + 1))"
	fi
done

echo "$failed devices failed to recover ($dev_count tested)"

# Show any difference between the device list taken before the test
# and the current one, then drop the snapshot.
lspci | diff -u "$pre_lspci" -
rm -f "$pre_lspci"

# Exit statuses are truncated to 8 bits, so e.g. 256 failures would be
# reported as success. Clamp the count so a failing run stays non-zero.
if [ "$failed" -gt 255 ] ; then
	exit 255
fi

exit "$failed"