hmi.sh 2.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
#!/bin/sh
#
# Copyright 2015, Daniel Axtens, IBM Corporation
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.


# Locate the getscom/putscom tools and record them in GETSCOM/PUTSCOM.
# Executables in the current directory win; otherwise both are looked up
# in $PATH. BOTH tools must be found, since the script needs getscom to
# check the FIR and putscom to inject the error.
if [ -x ./getscom ] && [ -x ./putscom ]; then
	GETSCOM=./getscom
	PUTSCOM=./putscom
elif GETSCOM=$(command -v getscom) && PUTSCOM=$(command -v putscom); then
	# command -v is the POSIX way to resolve a command name; it prints
	# the path and fails if the command does not exist.
	:
else
	# Diagnostics go to stderr so they are not mistaken for normal output.
	cat >&2 <<EOF
Can't find getscom/putscom in . or \$PATH.
See https://github.com/open-power/skiboot.
The tool is in external/xscom-utils
EOF
	exit 1
fi

# Number of HMI events a single injection is expected to produce.
# TODO: cope with things being offline.
expected_hmis=8

# COUNT_HMIS - print how many lines of dmesg output contain the string
# "Harmless Hypervisor Maintenance interrupt".
COUNT_HMIS()
{
	dmesg |
		grep -c -e 'Harmless Hypervisor Maintenance interrupt'
}

# massively expand snooze delay, allowing injection on all cores
if ! ppc64_cpu --smt-snooze-delay=1000000000; then
	echo "Warning: could not raise smt-snooze-delay; injection may fail on idle cores." >&2
fi

# Set the snooze delay back to 100.
# NOTE(review): 100 is hard-coded; the value in effect before this script
# ran is not saved, so confirm 100 is the right value for the system.
restore_snooze_delay() {
	ppc64_cpu --smt-snooze-delay=100
}

# when we exit, restore it
# The EXIT trap does the restore. HUP/INT/TERM are turned into a normal
# exit so the EXIT trap also runs when the script is interrupted: a signal
# trap on its own does not stop the script, and not every /bin/sh runs the
# EXIT trap when killed by an untrapped signal.
trap restore_snooze_delay EXIT
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM

# wait_for_hmis OLD_COUNT
# Poll COUNT_HMIS until $expected_hmis more events than OLD_COUNT have been
# logged, sleeping 5 seconds between polls and giving up after 12 sleeps
# (1 minute). Prints a progress line before each sleep.
# Returns 0 if enough events were seen, 1 on timeout.
# Success is judged by the event count, not the poll counter, so events
# that show up on the very last poll still count as success.
# The body is a subshell "( ... )" so its working variables stay private.
wait_for_hmis() (
	baseline=$1
	target=$((baseline + expected_hmis))
	polls=0
	seen=$(COUNT_HMIS)
	while [ "$seen" -lt "$target" ] && [ "$polls" -lt 12 ]; do
		echo "Seen $((seen - baseline)) HMI(s) out of $expected_hmis expected, sleeping"
		sleep 5
		polls=$((polls + 1))
		seen=$(COUNT_HMIS)
	done
	[ "$seen" -ge "$target" ]
)

# for each chip+core combination
# todo - less fragile parsing
# Each matched line reads "OCC: Chip <chip> Core <core>"; read splits it
# into words, so chip is the 3rd word and core is the 5th.
# Note: in most shells the loop runs in a subshell of the pipeline, so
# "exit 1" below ends the loop with status 1, which becomes the status of
# the pipeline.
grep -E -o 'OCC: Chip [0-9a-f]+ Core [0-9a-f]' < /sys/firmware/opal/msglog |
while read -r _occ_tag _chip_label chip _core_label core; do
	# SCOM address used for this core's FIR: the core digit is spliced
	# into the address.
	fir="0x1${core}013100"

	# verify that Core FIR is zero as expected
	if [ "$("$GETSCOM" -c "0x${chip}" "$fir")" != 0 ]; then
		echo "FIR was not zero before injection for chip $chip, core $core. Aborting!"
		echo "Result of $GETSCOM -c 0x${chip} $fir:"
		"$GETSCOM" -c "0x${chip}" "$fir"
		echo "If you get a -5 error, the core may be in idle state. Try stress-ng."
		echo "Otherwise, try $PUTSCOM -c 0x${chip} $fir 0"
		exit 1
	fi

	# keep track of the number of HMIs handled
	old_hmis=$(COUNT_HMIS)

	# do injection, adding a marker to dmesg for clarity
	echo "Injecting HMI on core $core, chip $chip" | tee /dev/kmsg
	# inject a RegFile recoverable error
	if ! "$PUTSCOM" -c "0x${chip}" "$fir" 2000000000000000 > /dev/null; then
		echo "Error injecting. Aborting!"
		exit 1
	fi

	# now we want to wait for all the HMIs to be processed
	# we expect one per thread on the core
	if ! wait_for_hmis "$old_hmis"; then
		echo "Haven't seen expected $expected_hmis recoveries after 1 min. Aborting."
		exit 1
	fi
	echo "Processed $expected_hmis events; presumed success. Check dmesg."
	echo ""
done