0001
0002
0003
0004
0005
0006
0007
0008
# Locate the xscom helpers. Executables in the current directory win;
# otherwise both tools must be resolvable through $PATH. Both are needed
# later, so the PATH fallback is only taken when getscom AND putscom are
# found (checking getscom alone would leave PUTSCOM empty).
if [ -x ./getscom ] && [ -x ./putscom ]; then
  GETSCOM=./getscom
  PUTSCOM=./putscom
elif command -v getscom > /dev/null 2>&1 &&
  command -v putscom > /dev/null 2>&1; then
  GETSCOM=$(command -v getscom)
  PUTSCOM=$(command -v putscom)
else
  # Diagnostics go to stderr; the script cannot continue without the tools.
  cat >&2 <<EOF
Can't find getscom/putscom in . or \$PATH.
See https://github.com/open-power/skiboot.
The tool is in external/xscom-utils
EOF
  exit 1
fi
0023
# Each injection is expected to raise 8 HMI events.
# TODO: cope with things being offline.
expected_hmis=8

# Print how many lines of the kernel log report a harmless HMI.
COUNT_HMIS() {
  dmesg \
    | grep -c -e 'Harmless Hypervisor Maintenance interrupt'
}
0030
# massively expand snooze delay, allowing injection on all cores
ppc64_cpu --smt-snooze-delay=1000000000

# Put the snooze delay back. Note this writes the fixed value 100, not
# whatever value was in effect before the script started.
restore_snooze_delay() {
  ppc64_cpu --smt-snooze-delay=100
}

# Restore on every exit path. The signal handlers only call exit so that
# the EXIT handler performs the restore exactly once; without them a HUP
# would restore the delay and then let the script carry on, and an
# INT/TERM may terminate some shells without running the EXIT handler.
trap restore_snooze_delay EXIT
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM
0036
# Succeed when $1, the text printed by getscom, is a hexadecimal zero.
# An optional 0x prefix and any amount of zero padding are accepted, so
# both "0" and "0000000000000000" count as zero. An empty string (getscom
# printed nothing on stdout) is not zero.
hmi_fir_is_zero() {
  case ${1#0x} in
    '' | *[!0]*) return 1 ;;
  esac
  return 0
}

# Inject one recoverable error on a core and wait for the resulting HMIs.
#   $1 - chip id as printed in the OPAL log (hex digits, no 0x prefix)
#   $2 - core number (a single hex digit, spliced into the FIR address)
# Reads GETSCOM, PUTSCOM and expected_hmis, and calls COUNT_HMIS.
# Returns 0 once expected_hmis new events have been counted, 1 otherwise.
hmi_inject_core() {
  chip=$1
  core=$2
  fir="0x1${core}013100"

  # verify that Core FIR is zero as expected
  fir_value=$("$GETSCOM" -c "0x${chip}" "$fir")
  if ! hmi_fir_is_zero "$fir_value"; then
    {
      echo "FIR was not zero before injection for chip $chip, core $core. Aborting!"
      echo "Result of $GETSCOM -c 0x${chip} $fir:"
      # run it again so its stdout and stderr appear under the heading
      "$GETSCOM" -c "0x${chip}" "$fir"
      echo "If you get a -5 error, the core may be in idle state. Try stress-ng."
      echo "Otherwise, try $PUTSCOM -c 0x${chip} $fir 0"
    } >&2
    return 1
  fi

  # keep track of the number of HMIs handled
  old_hmis=$(COUNT_HMIS)
  target_hmis=$((old_hmis + expected_hmis))

  # do injection, adding a marker to dmesg for clarity
  echo "Injecting HMI on core $core, chip $chip" | tee /dev/kmsg
  # inject a RegFile recoverable error
  if ! "$PUTSCOM" -c "0x${chip}" "$fir" 2000000000000000 > /dev/null; then
    echo "Error injecting. Aborting!" >&2
    return 1
  fi

  # now we want to wait for all the HMIs to be processed
  # we expect one per thread on the core
  # poll at most 12 times, 5 seconds apart (1 minute in total)
  polls=0
  new_hmis=$(COUNT_HMIS)
  while [ "$new_hmis" -lt "$target_hmis" ] && [ "$polls" -lt 12 ]; do
    echo "Seen $((new_hmis - old_hmis)) HMI(s) out of $expected_hmis expected, sleeping"
    sleep 5
    polls=$((polls + 1))
    new_hmis=$(COUNT_HMIS)
  done

  # Judge by the event count, not by the poll counter: events that show
  # up on the very last poll are a success, not a timeout.
  if [ "$new_hmis" -lt "$target_hmis" ]; then
    echo "Haven't seen expected $expected_hmis recoveries after 1 min. Aborting." >&2
    return 1
  fi
  echo "Processed $expected_hmis events; presumed success. Check dmesg."
  echo ""
}

# for each chip+core combination
# todo - less fragile parsing
# Each matched line reads "OCC: Chip <chip> Core <core>"; fields 3 and 5
# are the values of interest. The loop runs in a pipeline subshell, so the
# exit below ends the loop and becomes the pipeline's status.
grep -E -o 'OCC: Chip [0-9a-f]+ Core [0-9a-f]' < /sys/firmware/opal/msglog |
  while read -r occ_tag chip_tag chip core_tag core; do
    hmi_inject_core "$chip" "$core" || exit 1
  done