cmd/intrd/intrd.pl

	intrd.pl revision 7ff178cd8db129d385d3177eb20744d3b6efc59b
2N/A#!/usr/perl5/bin/perl
2N/A#
2N/A# CDDL HEADER START
2N/A#
2N/A# The contents of this file are subject to the terms of the
2N/A# Common Development and Distribution License (the "License").
2N/A# You may not use this file except in compliance with the License.
2N/A#
2N/A# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
2N/A# or http://www.opensolaris.org/os/licensing.
2N/A# See the License for the specific language governing permissions
2N/A# and limitations under the License.
2N/A#
2N/A# When distributing Covered Code, include this CDDL HEADER in each
2N/A# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
2N/A# If applicable, add the following below this CDDL HEADER, with the
2N/A# fields enclosed by brackets "[]" replaced with your own identifying
2N/A# information: Portions Copyright [yyyy] [name of copyright owner]
2N/A#
2N/A# CDDL HEADER END
2N/A#
2N/A
2N/A#
2N/A# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
2N/A#
2N/A
2N/Arequire 5.8.4;
2N/Ause strict;
2N/Ause warnings;
2N/Ause POSIX;
2N/Ause File::Basename("basename");
2N/A
my $cmdname = basename($0);     # name used as the syslog identity

my $using_scengen = 0;  # 1 if using scenario simulator
my $debug = 0;          # 1 if -D was given; raises the syslog mask to debug

my $normal_sleeptime = 10;      # time to sleep between samples
my $idle_sleeptime = 45;        # time to sleep when idle
my $onecpu_sleeptime = (60 * 15);   # used if only 1 CPU on system
my $sleeptime = $normal_sleeptime;  # either normal_ or idle_ or onecpu_

my $idle_intrload = .1;         # idle if interrupt load < 10%

# Upper bound on (time spent gathering kstats / $sleeptime); getstat()
# rejects any sample whose timerange exceeds it.
my $timerange_toohi    = .01;
my $statslen = 60;  # time period (in secs) to keep in @deltas


# Parse arguments. intrd does not accept any public arguments; the two
# arguments below are meant for testing purposes. -D generates a significant
# amount of syslog output. -S <filename> loads the filename as a perl
# script. That file is expected to implement a kstat "simulator" which
# can be used to feed information to intrd and verify intrd's responses.
#
# The loop tests definedness rather than truth so that an argument such as
# "0" or "" does not silently end argument processing.

while (defined(my $arg = shift @ARGV)) {
    if ($arg eq "-S" && @ARGV) {
        $using_scengen = 1;

        # The file name stays in @ARGV while the simulator is loaded and
        # is only removed afterwards.
        my $simfile = $ARGV[0];
        my $loaded = do $simfile;   # load simulator
        if (!defined($loaded)) {
            # "do FILE" reports compile/runtime errors in $@ and read
            # errors in $!. Report the problem but carry on, as before.
            if ($@) {
                warn "$cmdname: cannot parse simulator $simfile: $@\n";
            } elsif ($!) {
                warn "$cmdname: cannot load simulator $simfile: $!\n";
            }
        }
        shift @ARGV;
    } elsif ($arg eq "-D") {
        $debug = 1;
    }
}

# Without a simulator, load the real kstat/interrupt/syslog modules at run
# time and open the syslog connection.
if ($using_scengen == 0) {
    require Sun::Solaris::Kstat;
    require Sun::Solaris::Intrs;
    import Sun::Solaris::Intrs(qw(intrmove is_apic));
    require Sys::Syslog;
    import Sys::Syslog;
    openlog($cmdname, 'pid', 'daemon');
    setlogmask(Sys::Syslog::LOG_UPTO($debug > 0 ? &Sys::Syslog::LOG_DEBUG :
        &Sys::Syslog::LOG_INFO));
}
2N/A
my $asserted = 0;           # running count of failed assertions
my $assert_level = 'debug'; # syslog level for assertion failures

#
# VERIFY(condition, format, args...)
# Soft assertion. When the condition is numerically zero, the failure is
# logged through syslog (the message is used as a format string for the
# remaining arguments) and counted in $asserted. Returns true if the
# assertion FAILED, false otherwise, so callers can write
# "if (VERIFY(...)) { recover }".
#
sub VERIFY($@)
{
    my ($condition, @report) = @_;

    my $failed = ($condition == 0);
    return ($failed) unless $failed;

    my $format = shift(@report);
    syslog($assert_level, "VERIFY: $format", @report);
    $asserted++;
    return ($failed);
}
2N/A
2N/A
2N/A
2N/A
# Forward declarations, so that the prototypes are known wherever the
# functions are called.

# Statistics gathering and delta bookkeeping.
sub getstat($$);
sub generate_delta($$);
sub compress_deltas($);
sub dumpdelta($);

# Evaluation of a delta and reconfiguration entry point.
sub goodness($);
sub imbalanced($$);
sub do_reconfig($);

sub goodness_cpu($$);       # private function
sub move_intr($$$$);        # private function
sub move_intr_check($$$);   # private function
sub ivecs_to_string(@);     # private function
sub do_find_goal($$$$);     # private function
sub find_goal($$);      # private function
sub do_reconfig_cpu2cpu($$$$);  # private function
sub do_reconfig_cpu($$$);   # private function
2N/A
2N/A
2N/A#
2N/A# What follow are the basic data structures routines of intrd.
2N/A#
2N/A# getstat() is responsible for reading the kstats and generating a "stat" hash.
2N/A#
2N/A# generate_delta() is responsible for taking two "stat" hashes and creating
2N/A# a new "delta" hash that represents what has changed over time.
2N/A#
2N/A# compress_deltas() is responsible for taking a list of deltas and generating
2N/A# a single delta hash that encompasses all the time periods described by the
2N/A# deltas.
2N/A
2N/A
2N/A#
2N/A# getstat() is handed a reference to a kstat and generates a hash, returned
2N/A# by reference, containing all the fields from the kstats which we need.
2N/A# If it returns the scalar 0, it failed to gather the kstats, and the caller
2N/A# should react accordingly.
2N/A#
2N/A# getstat() is also responsible for maintaining a reasonable $sleeptime.
2N/A#
2N/A# {"snaptime"}          kstat's snaptime
2N/A# {<cpuid>}             one hash reference per online cpu
2N/A#  ->{"tot"}            == cpu:<cpuid>:sys:cpu_nsec_{user + kernel + idle}
2N/A#  ->{"crtime"}         == cpu:<cpuid>:sys:crtime
2N/A#  ->{"ivecs"}
2N/A#     ->{<cookie#>}     iterates over pci_intrs::<nexus>:cookie
2N/A#        ->{"time"}     == pci_intrs:<ivec#>:<nexus>:time (in nsec)
2N/A#        ->{"pil"}      == pci_intrs:<ivec#>:<nexus>:pil
2N/A#        ->{"crtime"}   == pci_intrs:<ivec#>:<nexus>:crtime
2N/A#        ->{"ino"}      == pci_intrs:<ivec#>:<nexus>:ino
2N/A#        ->{"num_ino"}  == num inos of single device instance sharing this entry
2N/A#               Will be > 1 on pcplusmp X86 systems for devices
2N/A#               with multiple MSI interrupts.
2N/A#        ->{"buspath"}  == pci_intrs:<ivec#>:<nexus>:buspath
2N/A#        ->{"name"}     == pci_intrs:<ivec#>:<nexus>:name
2N/A#        ->{"ihs"}      == pci_intrs:<ivec#>:<nexus>:ihs
2N/A#
2N/A
sub getstat($$)
{
    # $ks            reference to the kstat hash to read from
    # $pcplusmp_sys  true if the MSI interrupts of one device must be
    #                consolidated into a single ivec entry (see below)
    #
    # Returns a reference to the "stat" hash, or the scalar 0 if there is
    # at most one on-line CPU or the kstats took too long to gather.
    # Side effect: sets $sleeptime to $onecpu_sleeptime when at most one
    # CPU is on-line.
    my ($ks, $pcplusmp_sys) = @_;

    my $cpucnt = 0;
    my %stat = ();
    my ($minsnap, $maxsnap);

    # Hash of hash which matches (MSI device, ino) combos to kstats.
    my %msidevs = ();

    # kstats are not generated atomically. Each kstat hierarchy will
    # have been generated within the kernel at a different time. On a
    # thrashing system, we may not run quickly enough in order to get
    # coherent kstat timing information across all the kstats. To
    # determine if this is occurring, $minsnap/$maxsnap are used to
    # find the breadth between the first and last snaptime of all the
    # kstats we access. $maxsnap - $minsnap roughly represents the
    # total time taken up in getstat(). If this time approaches the
    # time between snapshots, our results may not be useful.

    $minsnap = -1;      # snaptime is always a positive number
    $maxsnap = $minsnap;

    # Iterate over the cpus in cpu:<cpuid>::. Check
    # cpu_info:<cpuid>:cpu_info<cpuid>:state to make sure the
    # processor is "on-line". If not, it isn't accepting interrupts
    # and doesn't concern us.
    #
    # Record cpu:<cpuid>:sys:snaptime, and check $minsnap/$maxsnap.

    while (my ($cpu, $cpst) = each %{$ks->{cpu}}) {
        next if !exists($ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state});
        #"state" fld of kstat w/
        #         modname    inst name-"cpuinfo0"
        my $state = $ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state};
        next if ($state !~ /^on-line\0/);
        my $cpu_sys = $cpst->{sys};

        $stat{$cpu}{tot} = ($cpu_sys->{cpu_nsec_idle} +
                    $cpu_sys->{cpu_nsec_user} +
                    $cpu_sys->{cpu_nsec_kernel});
        $stat{$cpu}{crtime} = $cpu_sys->{crtime};
        $stat{$cpu}{ivecs} = {};

        if ($minsnap == -1 || $cpu_sys->{snaptime} < $minsnap) {
            $minsnap = $cpu_sys->{snaptime};
        }
        if ($cpu_sys->{snaptime} > $maxsnap) {
            $maxsnap = $cpu_sys->{snaptime};
        }
        $cpucnt++;
    }

    if ($cpucnt <= 1) {
        $sleeptime = $onecpu_sleeptime;
        return (0); # nothing to do with 1 CPU
    }

    # Iterate over the ivecs. If the cpu is not on-line, ignore the
    # ivecs mapped to it, if any.
    #
    # Record pci_intrs:{inum}:<nexus>:time, snaptime, crtime, pil,
    # ino, name, and buspath. Check $minsnap/$maxsnap.

    foreach my $inst (values(%{$ks->{pci_intrs}})) {
        my $intrcfg = (values(%$inst))[0];
        my $cpu = $intrcfg->{cpu};

        next unless exists $stat{$cpu};
        next if ($intrcfg->{type} =~ /^disabled\0/);

        # Perl looks beyond NULL chars in pattern matching.
        # Truncate name field at the first NULL
        $intrcfg->{name} =~ s/\0.*$//;

        if ($intrcfg->{snaptime} < $minsnap) {
            $minsnap = $intrcfg->{snaptime};
        } elsif ($intrcfg->{snaptime} > $maxsnap) {
            $maxsnap = $intrcfg->{snaptime};
        }

        # Interrupts sharing the same bus path and ino are folded into
        # one entry keyed by this cookie.
        my $cookie = "$intrcfg->{buspath} $intrcfg->{ino}";
        if (exists $stat{$cpu}{ivecs}{$cookie}) {
            my $cookiestats = $stat{$cpu}{ivecs}{$cookie};

            $cookiestats->{time} += $intrcfg->{time};
            $cookiestats->{name} .= "/$intrcfg->{name}";

            # If this new interrupt sharing $cookie represents a
            # change from an earlier getstat, make sure that
            # generate_delta will see the change by setting
            # crtime to the most recent crtime of its components.

            if ($intrcfg->{crtime} > $cookiestats->{crtime}) {
                $cookiestats->{crtime} = $intrcfg->{crtime};
            }
            $cookiestats->{ihs}++;
            next;
        }
        $stat{$cpu}{ivecs}{$cookie}{time} = $intrcfg->{time};
        $stat{$cpu}{ivecs}{$cookie}{crtime} = $intrcfg->{crtime};
        $stat{$cpu}{ivecs}{$cookie}{pil} = $intrcfg->{pil};
        $stat{$cpu}{ivecs}{$cookie}{ino} = $intrcfg->{ino};
        $stat{$cpu}{ivecs}{$cookie}{num_ino} = 1;
        $stat{$cpu}{ivecs}{$cookie}{buspath} = $intrcfg->{buspath};
        $stat{$cpu}{ivecs}{$cookie}{name} = $intrcfg->{name};
        $stat{$cpu}{ivecs}{$cookie}{ihs} = 1;

        if ($pcplusmp_sys && ($intrcfg->{type} =~ /^msi\0/)) {
            if (!(exists($msidevs{$intrcfg->{name}}))) {
                $msidevs{$intrcfg->{name}} = {};
            }
            # Keep a reference to the slot holding the cookie's stats so
            # the consolidation pass below can update it in place.
            $msidevs{$intrcfg->{name}}{$intrcfg->{ino}} =
                \$stat{$cpu}{ivecs}{$cookie};
        }
    }

    # All MSI interrupts of a device instance share a single MSI address.
    # On X86 systems with an APIC, this MSI address is interpreted as CPU
    # routing info by the APIC.  For this reason, on these platforms, all
    # interrupts for MSI devices must be moved to the same CPU at the same
    # time.
    #
    # Since all interrupts will be on the same CPU on these platforms, all
    # interrupts can be consolidated into one ivec entry.  For such devices,
    # num_ino will be > 1 to denote that a group move is needed.

    # Loop thru all MSI devices on X86 pcplusmp systems.
    # Nop on other systems.
    foreach my $msidevkey (sort keys %msidevs) {

        # Loop thru inos of the device, sorted by lowest value first
        # For each cookie found for a device, incr num_ino for the
        # lowest cookie and remove other cookies.
        #
        # The inos are compared numerically: the default string sort
        # would place e.g. "10" before "9" and pick the wrong cookie as
        # the group's base entry.

        # Assumes PIL is the same for first and current cookies

        my $inos = $msidevs{$msidevkey};
        my $first_cookiep;
        foreach my $inokey (sort { $a <=> $b } keys %$inos) {
            my $curr_cookiep = $inos->{$inokey};
            if (!defined($first_cookiep)) {
                $first_cookiep = $curr_cookiep;
                next;
            }
            $$first_cookiep->{num_ino}++;
            $$first_cookiep->{time} += $$curr_cookiep->{time};
            if ($$curr_cookiep->{crtime} > $$first_cookiep->{crtime}) {
                $$first_cookiep->{crtime} = $$curr_cookiep->{crtime};
            }
            # Invalidate this cookie, less complicated and
            # more efficient than deleting it.
            $$curr_cookiep->{num_ino} = 0;
        }
    }

    # We define the timerange as the amount of time spent gathering the
    # various kstats, divided by our sleeptime. If we take a lot of time
    # to access the kstats, and then we create a delta comparing these
    # kstats with a prior set of kstats, that delta will cover
    # substantially different amounts of time depending upon which
    # interrupt or CPU is being examined.
    #
    # By checking the timerange here, we guarantee that any deltas
    # created from these kstats will contain self-consistent data,
    # in that all CPUs and interrupts cover a similar span of time.
    #
    # $timerange_toohi is the upper bound. Any timerange above
    # this is thrown out as garbage. If the stat is safely within this
    # bound, we treat the stat as representing an instant in time, rather
    # than the time range it actually spans. We arbitrarily choose minsnap
    # as the snaptime of the stat.

    $stat{snaptime} = $minsnap;
    my $timerange = ($maxsnap - $minsnap) / $sleeptime;
    return (0) if ($timerange > $timerange_toohi);  # i.e. failure
    return (\%stat);
}
2N/A
2N/A#
2N/A# dumpdelta takes a reference to our "delta" structure:
2N/A# {"missing"}           "1" if the delta's component stats had inconsistencies
2N/A# {"minsnap"}           time of the first kstat snaptime used in this delta
2N/A# {"maxsnap"}           time of the last kstat snaptime used in this delta
2N/A# {"goodness"}          cost function applied to this delta
2N/A# {"avgintrload"}       avg of interrupt load across cpus, as a percentage
2N/A# {"avgintrnsec"}       avg number of nsec spent in interrupts, per cpu
2N/A# {<cpuid>}             iterates over on-line cpus
2N/A#  ->{"intrs"}          cpu's movable intr time (sum of "time" for each ivec)
2N/A#  ->{"tot"}            CPU load from all sources in nsec
2N/A#  ->{"bigintr"}        largest value of {ivecs}{<ivec#>}{time} from below
2N/A#  ->{"intrload"}       intrs / tot
2N/A#  ->{"ivecs"}
2N/A#     ->{<ivec#>}       iterates over ivecs for this cpu
2N/A#        ->{"time"}     time used by this interrupt (in nsec)
2N/A#        ->{"pil"}      pil level of this interrupt
2N/A#        ->{"ino"}      interrupt number (or base vector if MSI group)
2N/A#        ->{"buspath"}  filename of the directory of the device's bus
2N/A#        ->{"name"}     device name
2N/A#        ->{"ihs"}      number of different handlers sharing this ino
2N/A#        ->{"num_ino"}  number of interrupt vectors in MSI group
2N/A#
2N/A# It prints out the delta structure in a nice, human readable display.
2N/A#
2N/A
sub dumpdelta($)
{
    # $delta: reference to a "delta" hash. Everything is written to
    # syslog at the 'debug' level; nothing is returned on purpose.
    my ($delta) = @_;

    # Global summary lines first.

    syslog('debug', "dumpdelta:");
    if ($delta->{missing} > 0) {
        syslog('debug', " RECONFIGURATION IN DELTA");
    }
    syslog('debug', " avgintrload: %5.2f%%  avgintrnsec: %d",
        $delta->{avgintrload} * 100, $delta->{avgintrnsec});
    if (exists($delta->{goodness})) {
        syslog('debug', "    goodness: %5.2f%%", $delta->{goodness} * 100);
    }

    # Then one section per cpu. Only the per-cpu entries are hash
    # references; the global fields are plain scalars and are skipped.

    foreach my $cpuid (keys(%$delta)) {
        my $cpustats = $delta->{$cpuid};
        next unless ref($cpustats);

        my $cputot = $cpustats->{tot};
        syslog('debug', "    cpu %3d intr %7.3f%%  (bigintr %7.3f%%)",
            $cpuid, $cpustats->{intrload} * 100,
            $cpustats->{bigintr} * 100 / $cputot);
        syslog('debug', "        intrs %d, bigintr %d",
            $cpustats->{intrs}, $cpustats->{bigintr});

        # One line per interrupt vector on this cpu. A name shared by
        # several handlers is shown with its handler count appended.

        my $ivecs = $cpustats->{ivecs};
        foreach my $cookie (keys(%$ivecs)) {
            my $ivst = $ivecs->{$cookie};
            syslog('debug', "    %15s:\"%s\": %7.3f%%  %d",
                ($ivst->{ihs} > 1 ? "$ivst->{name}($ivst->{ihs})" :
                $ivst->{name}), $cookie,
                $ivst->{time} * 100 / $cputot, $ivst->{time});
        }
    }
}
2N/A
2N/A#
2N/A# generate_delta($stat, $newstat) takes two stat references, returned from
2N/A# getstat(), and creates a %delta. %delta (not surprisingly) contains the
2N/A# same basic info as stat and newstat, but with the timestamps as deltas
2N/A# instead of absolute times. We return a reference to the delta.
2N/A#
2N/A
sub generate_delta($$)
{
    # $stat is the older snapshot, $newstat the newer one; both are
    # references returned by getstat(). Returns a reference to the new
    # delta. Whenever the two snapshots cannot be compared, the delta is
    # returned early with {missing} set.
    my ($stat, $newstat) = @_;

    # Running sums over all cpus, used for the averages at the end.
    my ($intrload, $intrnsec, $cpus);

    # Take the worstcase timerange
    my %delta = (
        minsnap => $stat->{snaptime},
        maxsnap => $newstat->{snaptime},
    );
    if (VERIFY($delta{maxsnap} > $delta{minsnap},
        "generate_delta: stats aren't ascending")) {
        $delta{missing} = 1;
        return (\%delta);
    }

    # A differing number of top-level entries means the set of cpus
    # changed between the two snapshots.

    $delta{missing} = (keys(%$stat) != keys(%$newstat));
    return (\%delta) if VERIFY($delta{missing} == 0,
        "generate_delta: number of CPUs changed");

    # Walk every cpu of the newer snapshot and compare it with the older.

    while (my ($cpu, $newcpst) = each %$newstat) {
        next unless ref($newcpst);      # skip non-cpuid fields

        # A cpu absent from the older snapshot, or one whose crtime
        # differs, was onlined in between.

        if (VERIFY(exists $stat->{$cpu} &&
            $stat->{$cpu}{crtime} == $newcpst->{crtime},
            "generate_delta: cpu $cpu changed")) {
            $delta{missing} = 1;
            return (\%delta);
        }
        my $oldcpst = $stat->{$cpu};

        my $tot = $newcpst->{tot} - $oldcpst->{tot};
        if (VERIFY($tot >= 0,
            "generate_delta: deltas are not ascending?")) {
            $delta{missing} = 1;
            return (\%delta);
        }

        # A zero total is bumped to 1 to avoid the remote chance of a
        # division by zero later on.
        my $dcpu = $delta{$cpu} = {
            tot => ($tot == 0 ? 1 : $tot),
            intrs => 0,
            bigintr => 0,
            ivecs => {},
        };

        # The two snapshots must list the same number of ivecs.

        if (VERIFY(keys(%{$oldcpst->{ivecs}}) ==
               keys(%{$newcpst->{ivecs}}),
               "generate_delta: cpu $cpu has more/less".
               " interrupts")) {
            $delta{missing} = 1;
            return (\%delta);
        }

        while (my ($inum, $newivec) = each %{$newcpst->{ivecs}}) {

            # Unused cookie, corresponding to an MSI vector which
            # is part of a group.  The whole group is accounted for
            # by a different cookie.
            next if ($newivec->{num_ino} == 0);

            # The ivec must exist in the older snapshot with the
            # very same crtime.
            if (VERIFY(exists $oldcpst->{ivecs}{$inum} &&
                   $oldcpst->{ivecs}{$inum}{crtime} ==
                   $newivec->{crtime},
                   "generate_delta: cpu $cpu inum $inum".
                   " has changed")) {
                $delta{missing} = 1;
                return (\%delta);
            }
            my $oldivec = $oldcpst->{ivecs}{$inum};

            # Time consumed by this interrupt between the snapshots.
            # On failure no entry is left behind for this ivec.

            my $time = $newivec->{time} - $oldivec->{time};
            if (VERIFY($time >= 0,
                   "generate_delta: ivec went backwards?")) {
                $delta{missing} = 1;
                return (\%delta);
            }
            $dcpu->{intrs} += $time;
            $dcpu->{bigintr} = $time if $time > $dcpu->{bigintr};

            # The descriptive fields are copied from the newer
            # snapshot; both snapshots carry the same crtime, so
            # they describe the same interrupt.

            $dcpu->{ivecs}{$inum} = {
                time => $time,
                pil => $newivec->{pil},
                ino => $newivec->{ino},
                buspath => $newivec->{buspath},
                name => $newivec->{name},
                ihs => $newivec->{ihs},
                num_ino => $newivec->{num_ino},
            };
        }

        # Interrupt time above the cpu total is hopefully just a
        # rounding error; raise the total to match.
        $dcpu->{tot} = $dcpu->{intrs} if $dcpu->{tot} < $dcpu->{intrs};

        $dcpu->{intrload} = $dcpu->{intrs} / $dcpu->{tot};
        $intrload += $dcpu->{intrload};
        $intrnsec += $dcpu->{intrs};
        $cpus++;
    }

    @delta{qw(avgintrload avgintrnsec)} = ($cpus > 0)
        ? ($intrload / $cpus, $intrnsec / $cpus)
        : (0, 0);
    return (\%delta);
}


# compress_deltas() takes a reference to a list of deltas, and returns a single new delta
# which represents the combined information from all the deltas. The deltas
# provided are assumed to be sequential in time. The resulting compressed
# delta looks just like any other delta. This new delta is also more accurate
# since its statistics are averaged over a longer period than any of the
# original deltas.

sub compress_deltas ($)
{
    # $deltas: reference to a list of delta hashes, oldest first.
    #
    # Returns a reference to the combined delta, or the scalar 0 if the
    # list is empty or contains a delta marked {missing}.
    # Side effect: sets $sleeptime to $idle_sleeptime when even the
    # busiest cpu's interrupt load is below $idle_intrload, and to
    # $normal_sleeptime otherwise.
    my ($deltas) = @_;

    my %newdelta = ();
    my ($intrs, $tot) = (0, 0); # sums over all cpus and all deltas
    my $cpus = 0;
    my ($high_intrload) = 0;

    if (VERIFY($#$deltas != -1,
           "compress_deltas: list of delta is empty?")) {
        return (0);
    }
    $newdelta{minsnap} = $deltas->[0]{minsnap};
    $newdelta{maxsnap} = $deltas->[$#$deltas]{maxsnap};
    $newdelta{missing} = 0;

    # Accumulate the per-cpu and per-ivec times of every delta.

    foreach my $delta (@$deltas) {
        if (VERIFY($delta->{missing} == 0,
            "compressing bad deltas?")) {
            return (0);
        }
        while (my ($cpuid, $cpu) = each %$delta) {
            next if !ref($cpu);     # ignore non-cpu fields

            $intrs += $cpu->{intrs};
            $tot += $cpu->{tot};
            $newdelta{$cpuid}{intrs} += $cpu->{intrs};
            $newdelta{$cpuid}{tot} += $cpu->{tot};
            if (!exists $newdelta{$cpuid}{ivecs}) {
                my %ivecs = ();
                $newdelta{$cpuid}{ivecs} = \%ivecs;
            }
            while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) {
                my $newivecs = $newdelta{$cpuid}{ivecs};

                # Times add up; the descriptive fields are taken from
                # the most recent delta that mentions the ivec.
                $newivecs->{$inum}{time} += $ivec->{time};
                $newivecs->{$inum}{pil} = $ivec->{pil};
                $newivecs->{$inum}{ino} = $ivec->{ino};
                $newivecs->{$inum}{buspath} = $ivec->{buspath};
                $newivecs->{$inum}{name} = $ivec->{name};
                $newivecs->{$inum}{ihs} = $ivec->{ihs};
                $newivecs->{$inum}{num_ino} = $ivec->{num_ino};
            }
        }
    }

    # Derive bigintr and intrload for every cpu of the combined delta.

    foreach my $cpu (values(%newdelta)) {
        next if !ref($cpu); # ignore non-cpu fields
        $cpus++;

        my $bigintr = 0;
        foreach my $ivec (values(%{$cpu->{ivecs}})) {
            if ($ivec->{time} > $bigintr) {
                $bigintr = $ivec->{time};
            }
        }
        $cpu->{bigintr} = $bigintr;

        # Clamp the total BEFORE it is used as a divisor; clamping it
        # afterwards cannot prevent a division by zero.
        $cpu->{tot} = 1 if $cpu->{tot} <= 0;
        $cpu->{intrload} = $cpu->{intrs} / $cpu->{tot};
        if ($high_intrload < $cpu->{intrload}) {
            $high_intrload = $cpu->{intrload};
        }
    }
    if ($cpus == 0) {
        $newdelta{avgintrnsec} = 0;
        $newdelta{avgintrload} = 0;
    } else {
        $newdelta{avgintrnsec} = $intrs / $cpus;
        # Same precaution for the overall total.
        $newdelta{avgintrload} = ($tot > 0) ? $intrs / $tot : 0;
    }
    $sleeptime = ($high_intrload < $idle_intrload) ? $idle_sleeptime :
        $normal_sleeptime;
    return (\%newdelta);
}


# What follow are the core functions responsible for examining the deltas
# generated above and deciding what to do about them.
#
# goodness() and its helper goodness_cpu() return a heuristic which describes
# how good (or bad) the current interrupt balance is. The value returned will
# be between 0 and 1, with 0 representing maximum goodness, and 1 representing
# maximum badness.
#
# imbalanced() compares a current and historical value of goodness, and
# determines if there has been enough change to warrant evaluating a
# reconfiguration of the interrupts
#
# do_reconfig(), and its helpers, do_reconfig_cpu(), do_reconfig_cpu2cpu(),
# find_goal(), do_find_goal(), and move_intr(), are responsible for examining
# a delta and determining the best possible assignment of interrupts to CPUs.
#
# It is important that do_reconfig() be in alignment with goodness(). If
# do_reconfig were to generate a new interrupt distribution that worsened
# goodness, we could get into a pathological loop with intrd fighting itself,
# constantly deciding that things are imbalanced, and then changing things
# only to make them worse.


# $goodness_unsafe_load: a CPU whose interrupt load exceeds this value while
# it services more than one interrupt is given the worst goodness (1) by
# goodness_cpu().
# $goodness_mindelta: imbalanced() reports an imbalance once goodness differs
# from the baseline by more than this value, and do_reconfig() does not
# reconfigure at all when goodness is already below it.

my $goodness_unsafe_load = .9;
my $goodness_mindelta = .1;

# goodness(%delta) examines a delta and returns its "goodness". goodness will
# be between 0 (best) and 1 (major bad). goodness is determined by evaluating
# the goodness of each individual cpu, and returning the worst case. This
# helps on systems with many CPUs, where a single pathological CPU
# might otherwise be ignored because the average was OK.
#
# To calculate the goodness of an individual CPU, we start by looking at its
# load due to interrupts. If the load is above a certain high threshold and
# there is more than one interrupt assigned to this CPU, we set goodness
# to worst-case. If the load is below the average interrupt load of all CPUs,
# then we return best-case, since what's to complain about?
#
# Otherwise we look at how much the load is above the average, and return
# that as the goodness, with one caveat: we never return more than the CPU's
# interrupt load ignoring its largest single interrupt source. This is
# because a CPU with one high-load interrupt, and no other interrupts, is
# perfectly balanced. Nothing can be done to improve the situation, and thus
# it is perfectly balanced even if the interrupt's load is 100%.

sub goodness($)
{
    # $delta: reference to a delta hash. Returns the worst per-cpu
    # goodness found in it, between 0 (best) and 1 (worst).
    my ($delta) = @_;

    # A delta with inconsistencies is as bad as it gets.
    if ($delta->{missing} > 0) {
        return (1);
    }

    my $worst = 0;

    # Only the per-cpu entries of a delta are references.
    foreach my $cpustats (grep { ref($_) } values(%$delta)) {
        my $score = goodness_cpu($cpustats, $delta->{avgintrload});

        # An out-of-range score is logged together with the delta and
        # treated as the worst case.
        if (VERIFY($score >= 0 && $score <= 1,
               "goodness: cpu goodness out of range?")) {
            dumpdelta($delta);
            return (1);
        }

        # worst case, no need to continue
        return (1) if $score == 1;

        $worst = $score if $score > $worst;
    }
    return ($worst);
}

sub goodness_cpu($$)        # private function
{
    # $cpu: one per-cpu entry of a delta; $avgintrload: the delta's
    # average interrupt load. Returns this cpu's goodness (0 is best,
    # 1 is worst).
    my ($cpu, $avgintrload) = @_;

    my ($intrs, $tot) = ($cpu->{intrs}, $cpu->{tot});
    my $load = $intrs / $tot;

    # A load below the average is perfectly good.
    if ($load < $avgintrload) {
        return (0);
    }

    # Load that would remain if the single biggest interrupt were set
    # aside: the most that offloading interrupts from this cpu can gain.
    my $movable_load = ($intrs - $cpu->{bigintr}) / $tot;

    # A cpu saturated with interrupt handling that serves more than one
    # interrupt source is a major imbalance, since the other sources
    # could be starved if of a lower pil. Report the worst possible
    # value, which effectively contaminates the whole delta.
    my $nivecs = keys(%{$cpu->{ivecs}});
    return (1) if ($nivecs > 1 && $load > $goodness_unsafe_load);

    # Otherwise report the excess over the average, capped at the
    # movable load.
    my $excess = $load - $avgintrload;
    return ($excess > $movable_load ? $movable_load : $excess);
}


# imbalanced() is used by the main routine to determine if the goodness
# has shifted far enough from our last baseline to warrant a reassignment
# of interrupts. A very high goodness indicates that a CPU is way out of
# whack. If the goodness has varied too much since the baseline, then
# perhaps a reconfiguration is worth considering.

sub imbalanced ($$)
{
    # $goodness: current goodness; $baseline: goodness at the last
    # baseline. Returns 1 if a reconfiguration is worth evaluating,
    # 0 otherwise.
    my ($goodness, $baseline) = @_;

    # Either the balance is pathological on its own, or it has crept
    # further from the baseline than $goodness_mindelta.
    if ($goodness > .50 ||
        abs($goodness - $baseline) > $goodness_mindelta) {
        return (1);
    }
    return (0);
}

# do_reconfig(), do_reconfig_cpu(), and do_reconfig_cpu2cpu(), are the
# decision-making functions responsible for generating a new interrupt
# distribution. They are designed with the definition of goodness() in
# mind, i.e. they use the same definition of "good distribution" as does
# goodness().
#
# do_reconfig() is responsible for deciding whether a redistribution is
# actually warranted. If the goodness is already pretty good, it doesn't
# waste the CPU time to generate a new distribution. If it
# calculates a new distribution and finds that it is not sufficiently
# improved from the prior distribution, it will not do the redistribution,
# mainly to avoid the disruption to system performance caused by
# rejuggling interrupts.
#
# Its main loop works by going through a list of cpus sorted from
# highest to lowest interrupt load. It removes the highest-load cpus
# one at a time and hands them off to do_reconfig_cpu(). This function
# then re-sorts the remaining CPUs from lowest to highest interrupt load,
# and one at a time attempts to rejuggle interrupts between the original
# high-load CPU and the low-load CPU. Rejuggling on a high-load CPU is
# considered finished as soon as its interrupt load is within
# $goodness_mindelta of the average interrupt load. Such a CPU will have
# a goodness of below the $goodness_mindelta threshold.

#
# move_intr(\%delta, $inum, $oldcpu, $newcpu)
# used by reconfiguration code to move an interrupt between cpus within
# a delta. This manipulates data structures, and does not actually move
# the interrupt on the running system.
#
sub move_intr($$$$)     # private function
{
    # $delta: delta being rearranged; $inum: key of the ivec to move;
    # $oldcpuid/$newcpuid: source and destination cpu ids. Only the
    # delta's bookkeeping is updated here.
    my ($delta, $inum, $oldcpuid, $newcpuid) = @_;

    my $moved = $delta->{$oldcpuid}{ivecs}{$inum};
    my $src = $delta->{$oldcpuid};
    my $cost = $moved->{time};

    # Take the ivec off the source cpu.

    $src->{intrs} -= $cost;
    $src->{intrload} = $src->{intrs} / $src->{tot};
    delete($src->{ivecs}{$inum});

    VERIFY($src->{intrs} >= 0, "move_intr: intr's time > total time?");
    VERIFY($cost <= $src->{bigintr},
           "move_intr: intr's time > bigintr?");

    # If the ivec was the source cpu's biggest interrupt, find the new
    # biggest among those that remain.
    if ($cost >= $src->{bigintr}) {
        my $largest = 0;

        foreach my $remaining (values(%{$src->{ivecs}})) {
            if ($remaining->{time} > $largest) {
                $largest = $remaining->{time};
            }
        }
        $src->{bigintr} = $largest;
    }

    # Put the ivec onto the destination cpu.

    my $dst = $delta->{$newcpuid};

    $moved->{nowcpu} = $newcpuid;
    $dst->{intrs} += $cost;
    $dst->{intrload} = $dst->{intrs} / $dst->{tot};
    $dst->{ivecs}{$inum} = $moved;

    $dst->{bigintr} = $cost
        if $cost > $dst->{bigintr};
}

#
# move_intr_check(\%delta, $oldcpuid, $newcpuid)
# Sanity check used after interrupts have been shuffled between two cpus
# within a delta: neither cpu may be charged more interrupt time ({intrs})
# than its total time ({tot}).
#
sub move_intr_check($$$)    # private function
{
    my ($delta, $oldcpuid, $newcpuid) = @_;

    # Source cpu first, then target cpu.

    VERIFY($delta->{$oldcpuid}{intrs} <= $delta->{$oldcpuid}{tot},
        "Moved interrupts left 100+%% load on src cpu");
    VERIFY($delta->{$newcpuid}{intrs} <= $delta->{$newcpuid}{tot},
        "Moved interrupts left 100+%% load on tgt cpu");
}

#
# ivecs_to_string(@ivecs)
# Returns a string made of the {inum} of every ivec passed in, each one
# preceded by a single space, in argument order. An empty argument list
# yields the empty string.
#
sub ivecs_to_string(@)      # private function
{
    return (join("", map(" $_->{inum}", @_)));
}


#
# do_reconfig(\%delta)
# Works out a better interrupt assignment for the given delta and, if the
# gain is worthwhile, applies it with intrmove(). Returns:
#   0  - nothing was moved (assignment already good enough)
#   1  - interrupts were moved
#  -1  - failure (delta had {missing} set, or at least one move failed)
#
sub do_reconfig($)
{
    my ($delta) = @_;

    my $oldgoodness = $delta->{goodness};

    # Goodness cannot get better than 0. If even reaching 0 would be an
    # improvement smaller than $goodness_mindelta, it is not worth acting.

    if ($oldgoodness < $goodness_mindelta) {
        syslog('debug', "goodness good enough, don't reconfig");
        return (0);
    }

    syslog('notice', "Optimizing interrupt assignments");

    # VERIFY() yields true here when its check fails.

    return (-1) if VERIFY($delta->{missing} == 0,
        "RECONFIG Aborted: should not have a delta with missing");

    # Collect every cpuid, and tag each ivec with where it started
    # ({origcpu}), where it is now ({nowcpu}) and its own key ({inum}).

    my @candidates = ();

    while (my ($cpuid, $cpu) = each %$delta) {
        next unless ref($cpu);  # only cpu entries are references

        push(@candidates, $cpuid);
        while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) {
            @{$ivec}{qw(origcpu nowcpu inum)} = ($cpuid, $cpuid, $inum);
        }
    }

    # Repeatedly take the cpu with the highest interrupt load off the
    # list and try to spread its interrupts over the cpus still listed.
    # The list is re-sorted on every pass because do_reconfig_cpu() can
    # change the loads. We are finished once the busiest remaining cpu
    # is both at or below $goodness_unsafe_load and within
    # $goodness_mindelta of the average load. A cpu whose own goodness
    # is already below $goodness_mindelta is simply skipped.

    while (@candidates) {
        @candidates =
            sort { $delta->{$b}{intrload} <=> $delta->{$a}{intrload} }
            @candidates;

        my $hotcpuid = shift(@candidates);
        my $hotload = $delta->{$hotcpuid}{intrload};
        my $avgload = $delta->{avgintrload};

        if ($hotload <= $goodness_unsafe_load &&
            $hotload <= $avgload + $goodness_mindelta) {
            syslog('debug', "finished reconfig: cpu $hotcpuid load ".
                "$hotload avgload $avgload");
            last;
        }
        next if goodness_cpu($delta->{$hotcpuid}, $avgload) <
            $goodness_mindelta;

        do_reconfig_cpu($delta, \@candidates, $hotcpuid);
    }

    # Judge the result. A small gain is not worth the disruption, except
    # when the starting goodness was 1 and the new goodness is not: then
    # any improvement is taken.

    my $newgoodness = goodness($delta);
    VERIFY($newgoodness <= $oldgoodness,
        "reconfig: result has worse goodness?");

    my $gain = $oldgoodness - $newgoodness;
    my $desperate = ($oldgoodness == 1 && $newgoodness != 1);

    if (!$desperate && $gain < $goodness_mindelta) {
        syslog('debug', "goodness already near optimum, don't reconfig");
        return (0);
    }
    syslog('debug', "goodness %5.2f%% --> %5.2f%%", $oldgoodness * 100,
        $newgoodness * 100);

    # Apply the plan: every ivec that now sits on a cpu other than its
    # {origcpu} is moved for real. The 'warning' message is logged only
    # for the first failure; each failure gets its own 'debug' message.

    my $status = 1;
    my $failures = 0;

    while (my ($cpuid, $cpu) = each %$delta) {
        next if $cpuid =~ /\D/;     # keys with a non-digit are not cpus

        while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) {
            next if ($ivec->{origcpu} == $cpuid);   # did not move
            next if intrmove($ivec->{buspath}, $ivec->{origcpu},
                $ivec->{ino}, $cpuid, $ivec->{num_ino});

            syslog('warning', "Unable to move interrupts")
                if $failures++ == 0;
            syslog('debug', "Unable to move buspath $ivec->{buspath} ".
                "ino $ivec->{ino} to cpu $cpuid");
            $status = -1;
        }
    }

    syslog('notice', "Interrupt assignments optimized");
    return ($status);
}

#
# do_reconfig_cpu(\%delta, \@cpusortlist, $oldcpuid)
# Tries to lower the interrupt load of cpu $oldcpuid by exchanging
# interrupts with the cpus named in @cpusortlist. The caller's list is
# not modified.
#
sub do_reconfig_cpu($$$)    # private function
{
    my ($delta, $cpusortlist, $oldcpuid) = @_;

    syslog('debug', "reconfiguring $oldcpuid");

    my $srccpu = $delta->{$oldcpuid};
    my $avgload = $delta->{avgintrload};

    # The caller keeps @$cpusortlist sorted from highest to lowest
    # interrupt load, so walking it in reverse visits the least loaded
    # cpu first. Each candidate is paired with $oldcpuid in turn. Stop
    # as soon as $oldcpuid's goodness drops below $goodness_mindelta, or
    # when the next candidate carries a higher load than $oldcpuid does.

    foreach my $tgtcpuid (reverse(@$cpusortlist)) {
        last if goodness_cpu($srccpu, $avgload) < $goodness_mindelta;

        my $tgtcpu = $delta->{$tgtcpuid};
        my $srcload = $srccpu->{intrload};
        last if $tgtcpu->{intrload} > $srcload;

        do_reconfig_cpu2cpu($delta, $oldcpuid, $tgtcpuid, $srcload);
    }
}

#
# do_reconfig_cpu2cpu(\%delta, $srccpuid, $tgtcpuid, $srcload)
# Considers exchanging interrupts between a heavily loaded cpu ($srccpuid)
# and a less loaded one ($tgtcpuid). All changes are made to the delta via
# move_intr(). $srcload is the source cpu's interrupt load on entry and is
# used only by the closing sanity check.
#
sub do_reconfig_cpu2cpu($$$$)   # private function
{
    my ($delta, $srccpuid, $tgtcpuid, $srcload) = @_;

    syslog('debug', "exchanging intrs between $srccpuid and $tgtcpuid");

    # Pool the ivecs of both cpus; there is nothing to do if the pool is
    # empty. Then order the pool from most to least time consumed.

    my @pool = (values(%{$delta->{$srccpuid}{ivecs}}),
        values(%{$delta->{$tgtcpuid}{ivecs}}));
    return unless @pool;

    @pool = sort { $b->{time} <=> $a->{time} } @pool;

    # The target for srccpuid is normally the average interrupt time
    # across all cpus. tgtcpuid may itself be above that average, so
    # never aim lower than the mean of the two cpus involved; otherwise
    # srccpuid could end up below tgtcpuid.

    my $pairavg =
        ($delta->{$srccpuid}{intrs} + $delta->{$tgtcpuid}{intrs}) / 2;
    my $goal = $delta->{avgintrnsec};
    $goal = $pairavg if $goal < $pairavg;

    # To limit disruption, the single largest interrupt stays put when
    # it originated on srccpuid: take it out of the pool and reduce the
    # goal by its time.

    my $biggest = $pool[0];
    if ($biggest->{origcpu} == $srccpuid) {
        syslog('debug', "Keeping $biggest->{inum} on $srccpuid");
        $goal -= $biggest->{time};
        shift(@pool);
    }

    syslog('debug', "GOAL: inums should total $goal");
    find_goal(\@pool, $goal);

    # find_goal() has set {goal} to 1 on each ivec that belongs on
    # srccpuid and to 0 on each that belongs on tgtcpuid. Move every
    # ivec that is not already where it belongs.

    foreach my $iv (@pool) {
        syslog('debug', "ivec $iv->{inum} goal $iv->{goal}");
        VERIFY($iv->{nowcpu} == $srccpuid || $iv->{nowcpu} == $tgtcpuid,
            "cpu2cpu found an interrupt not currently on src or tgt cpu");

        my $homecpuid = $iv->{goal} ? $srccpuid : $tgtcpuid;
        if ($iv->{nowcpu} != $homecpuid) {
            move_intr($delta, $iv->{inum}, $iv->{nowcpu}, $homecpuid);
        }
    }
    move_intr_check($delta, $srccpuid, $tgtcpuid); # asserts

    # The source cpu should be no busier than before, yet still above
    # the average load.

    my $newload = $delta->{$srccpuid}{intrs} / $delta->{$srccpuid}{tot};
    VERIFY($newload <= $srcload && $newload > $delta->{avgintrload},
        "cpu2cpu: new load didn't end up in expected range");
}


# find_goal() and its helper do_find_goal() are used to find the best
# combination of interrupts in order to generate a load that is as close
# as possible to a goal load without falling below that goal. Before returning
# to its caller, find_goal() sets a new value in the hash of each interrupt,
# {goal}, which if set signifies that this interrupt is one of the interrupts
# identified as part of the set of interrupts which best meet the goal.
#
# The arguments to find_goal are a list of ivecs (hash references), sorted
# by descending {time}, and the goal load. The goal is relative to {time}.
# The best fit is determined by performing a depth-first search. do_find_goal
# is the recursive subroutine which carries out the search.
#
# It is passed an index as an argument, originally 0. On a given invocation,
# it is only to consider interrupts in the ivecs array starting at that index.
# It then considers two possibilities:
#   1) What is the best goal-fit if I include ivecs[index]?
#   2) What is the best goal-fit if I exclude ivecs[index]?
# To determine case 1, it subtracts the load of ivecs[index] from the goal,
# and calls itself recursively with that new goal and index++.
# To determine case 2, it calls itself recursively with the same goal and
# index++.
#
# It then compares the two results, decides which one best meets the goals,
# and returns the result. The return value is the best-fit's interrupt load,
# followed by a list of all the interrupts which make up that best-fit.
#
# As an optimization, a second array loads[] is created which mirrors ivecs[].
# loads[i] will equal the total loads of all ivecs[i..$#ivecs]. This is used
# by do_find_goal to avoid recursing all the way to the end of the ivecs
# array if including all remaining interrupts will still leave the best-fit
# at below goal load. If so, it then includes all remaining interrupts on
# the goal list and returns.
#
sub find_goal($$)       # private function
{
    my ($ivecs, $goal) = @_;

    # @chosen ends up holding the ivecs picked for the goal set. It stays
    # empty when $goal <= 0, since the empty set is then the best fit.

    my @chosen = ();

    unless ($goal <= 0) {
        syslog('debug', "finding goal from intrs %s",
            ivecs_to_string(@$ivecs));

        # Build @loads so that $loads[$i] is the summed {time} of
        # $ivecs->[$i .. $#$ivecs]: start from the grand total and peel
        # off one ivec at a time.

        my $remaining = 0;
        $remaining += $_->{time} foreach (@$ivecs);

        my @loads = ();
        foreach my $iv (@$ivecs) {
            push(@loads, $remaining);
            $remaining -= $iv->{time};
        }

        my ($reached, @fit) = do_find_goal($ivecs, \@loads, $goal, 0);
        VERIFY($reached >= $goal, "find_goal didn't meet goals");
        @chosen = @fit;
    }
    syslog('debug', "goals found: %s", ivecs_to_string(@chosen));

    # Record the verdict on every ivec: {goal} = 1 if it was chosen,
    # 0 otherwise. @chosen lists ivecs in the same relative order as
    # @$ivecs, so a single cursor into @chosen is enough.

    my $next = 0;
    foreach my $iv (@$ivecs) {
        if ($next < @chosen && $iv == $chosen[$next]) {
            syslog('debug', "inum $iv->{inum} on source cpu");
            $iv->{goal} = 1;
            $next++;
        } else {
            syslog('debug', "inum $iv->{inum} on target cpu");
            $iv->{goal} = 0;
        }
    }
}


#
# do_find_goal(\@ivecs, \@loads, $goal, $idx)
# Recursive worker for find_goal(). Looks only at $ivecs->[$idx ..] and
# returns a list: the load of the best fit found, followed by the ivecs
# making up that fit, in the order they appear in @$ivecs.
#   \@ivecs - ivec hash refs (find_goal's caller sorts by descending {time})
#   \@loads - $loads->[$i] is the summed {time} of $ivecs->[$i ..]
#   $goal   - load the remaining ivecs should reach
#   $idx    - first index that may still be chosen
#
sub do_find_goal($$$$)      # private function
{
    my ($ivecs, $loads, $goal, $idx) = @_;

    # Past the last ivec: the only fit is the empty set, with load 0.

    return (0) if $idx > $#{$ivecs};

    syslog('debug', "$idx: finding goal $goal inum $ivecs->[$idx]{inum}");

    my $load = $ivecs->[$idx]{time};

    # If taking every remaining ivec still does not exceed the goal,
    # no subset can come closer: return them all without recursing.

    if ($loads->[$idx] <= $goal) {
        my @rest = @$ivecs[$idx .. $#{$ivecs}];
        syslog('debug',
            "$idx: including all remaining intrs %s with load %d",
            ivecs_to_string(@rest), $loads->[$idx]);
        return ($loads->[$idx], @rest);
    }

    # "With" option: the best fit that includes $ivecs->[$idx]. If this
    # ivec alone already reaches the goal, adding more would only move
    # further above the goal, so stop there. Otherwise recurse for the
    # part of the goal that is still missing.

    my $with = $load;
    my @goals_with = ();
    unless ($goal <= $load) {
        my $sub;
        ($sub, @goals_with) =
            do_find_goal($ivecs, $loads, $goal - $load, $idx + 1);
        $with = $sub + $load;
    }
    syslog('debug', "$idx: with-load $with intrs %s",
        ivecs_to_string($ivecs->[$idx], @goals_with));

    # "Without" option: the best fit that excludes $ivecs->[$idx].
    # (Called like the recursion above, without the legacy "&" sigil.)

    my ($without, @goals_without) =
        do_find_goal($ivecs, $loads, $goal, $idx + 1);
    syslog('debug', "$idx: without-load $without intrs %s",
        ivecs_to_string(@goals_without));

    # Pick the better option:
    #  - exactly one of them reaches the goal: take that one;
    #  - both reach the goal: take the smaller load ("with" on a tie);
    #  - neither reaches it: take the larger load ("with" on a tie).

    my $with_ok = ($with >= $goal);
    my $without_ok = ($without >= $goal);

    my $take_without;
    if ($with_ok && !$without_ok) {
        $take_without = 0;
    } elsif (!$with_ok && $without_ok) {
        $take_without = 1;
    } elsif ($with_ok) {
        $take_without = ($without < $with);
    } else {
        $take_without = ($without > $with);
    }

    # Return the winning load followed by the ivecs that produce it.

    if ($take_without) {
        syslog('debug', "$idx: going without");
        return ($without, @goals_without);
    }

    syslog('debug', "$idx: going with");
    return ($with, $ivecs->[$idx], @goals_with);
}


# Daemon start-up. Everything from here to the main loop runs once.
# The " (debug)" suffix is logged when $debug is set.
syslog('debug', "intrd is starting".($debug ? " (debug)" : ""));

# State carried from one pass of the main loop to the next.
my @deltas = ();            # collected deltas: new ones are pushed on the
                            # end, old ones are shifted off the front
my $deltas_tottime = 0;     # sum of maxsnap-minsnap across @deltas
my $avggoodness;            # NOTE(review): not referenced by the start-up
                            # code or main loop below -- confirm if needed
my $baseline_goodness = 0;  # goodness saved the last time do_reconfig()
                            # returned 0
my $compdelta;              # result of compress_deltas(\@deltas)

my $do_reconfig;            # true when this pass should call do_reconfig()

# temp variables (scratch values, reassigned on each pass of the main loop)
my $goodness;
my $deltatime;
my $olddelta;
my $olddeltatime;
my $delta;
my $newstat;
my $below_statslen;
my $newtime;
my $ret;


# INT, HUP and TERM all share one handler that merely sets $gotsig; the
# flag is tested further down, where it is safe to exit.
my $gotsig = 0;
$SIG{INT} = sub { $gotsig = 1; };     # don't die in the middle of retargeting
$SIG{HUP} = $SIG{INT};
$SIG{TERM} = $SIG{INT};

# Obtain the kstat data: from Sun::Solaris::Kstat normally, or from
# myks_update() when running under the scenario simulator.
my $ks;
if ($using_scengen == 0) {
    $ks = Sun::Solaris::Kstat->new();
} else {
    $ks = myks_update();    # supplied by the simulator
}

# If no pci_intrs kstats were found, we need to exit, but we can't because
# SMF will restart us and/or report an error to the administrator. But
# there's nothing an administrator can do. So print out a message for SMF
# logs and silently pause forever.

if (!exists($ks->{pci_intrs})) {
    print STDERR "$cmdname: no interrupts were found; ".
        "your PCI bus may not yet be supported\n";
    # Sleep until one of the signals handled above arrives, then exit
    # with status 0.
    pause() while $gotsig == 0;
    exit 0;
}

# See if this is a system with a pcplusmp APIC.
# Such systems will get special handling.
# Assume that if one bus has a pcplusmp APIC that they all do.

# Get a list of pci_intrs kstats.
# The first value of the first entry serves as the representative kstat.
my @elem = values(%{$ks->{pci_intrs}});
my $elem0 = $elem[0];
my $elemval = (values(%$elem0))[0];

# Use its buspath to query the system.  It is assumed that either all or none
# of the busses on a system are hosted by the pcplusmp APIC or APIX.
my $pcplusmp_sys = is_apic($elemval->{buspath});

# Initial snapshot; the first delta of the main loop is measured against it.
my $stat = getstat($ks, $pcplusmp_sys);

# Main loop. Runs until a signal sets $gotsig. Each pass takes a new
# snapshot, turns it into a delta against the previous snapshot, folds the
# delta into @deltas, and decides whether do_reconfig() should be called.
for (;;) {
    # Throw away all collected statistics and start over. (A named sub
    # is defined once at compile time, not on every pass of the loop.)
    sub clear_deltas {
        @deltas = ();
        $deltas_tottime = 0;
        $stat = 0;   # prevent next gen_delta() from setting {missing}
    }

    # 1. Sleep, update the kstats, and save the new stats in $newstat.
    # Under the simulator there is no sleep; myks_update() supplies the
    # next set of kstats directly.

    exit 0 if $gotsig;      # if we got ^C / SIGTERM, exit
    if ($using_scengen == 0) {
        sleep($sleeptime);
        exit 0 if $gotsig;  # if we got ^C / SIGTERM, exit
        $ks->update();
    } else {
        $ks = myks_update();
    }
    $newstat = getstat($ks, $pcplusmp_sys);

    # $stat or $newstat could be zero if they're uninitialized, or if
    # getstat() failed. If $stat is zero, move $newstat to $stat, sleep
    # and try again. If $newstat is zero, then we also sleep and try
    # again, hoping the problem will clear up.

    next if (!ref $newstat);
    if (!ref $stat) {
        $stat = $newstat;
        next;
    }

    # 2. Compare $newstat with the prior set of values, result in %$delta.

    $delta = generate_delta($stat, $newstat);
    dumpdelta($delta) if $debug;    # Dump most recent stats to stdout.
    $stat = $newstat;   # The new stats now become the old stats.


    # 3. If $delta->{missing}, then there has been a reconfiguration of
    # either cpus or interrupts (probably both). We need to toss out our
    # old set of statistics and start from scratch.
    #
    # Also, if the delta covers a very long range of time, then we've
    # been experiencing a system overload that has resulted in intrd
    # not being allowed to run effectively for a while now. As above,
    # toss our old statistics and start from scratch.

    $deltatime = $delta->{maxsnap} - $delta->{minsnap};
    if ($delta->{missing} > 0 || $deltatime > $statslen) {
        clear_deltas();
        syslog('debug', "evaluating interrupt assignments");
        next;
    }


    # 4. Incorporate new delta into the list of deltas, and associated
    # statistics. If this delta is the one that takes the accumulated
    # time from below $statslen to $statslen or more, then it's time to
    # evaluate a reconfiguration.

    $below_statslen = ($deltas_tottime < $statslen);
    $deltas_tottime += $deltatime;
    $do_reconfig = ($below_statslen && $deltas_tottime >= $statslen);
    push(@deltas, $delta);

    # 5. Remove old deltas if total time is more than $statslen. We use
    # @deltas as a moving average of the last $statslen seconds. Shift
    # off the oldest deltas, but only if that doesn't cause us to fall
    # below $statslen seconds. At least one delta is always kept.

    while (@deltas > 1) {
        $olddelta = $deltas[0];
        $olddeltatime = $olddelta->{maxsnap} - $olddelta->{minsnap};
        $newtime = $deltas_tottime - $olddeltatime;
        last if ($newtime < $statslen);

        shift(@deltas);
        $deltas_tottime = $newtime;
    }

    # 6. The brains of the operation are here. First, check if we're
    # imbalanced, and if so set $do_reconfig. If $do_reconfig is set,
    # either because of imbalance or above in step 4, we evaluate a
    # new configuration.
    #
    # First, take @deltas and generate a single "compressed" delta
    # which summarizes them all. Pass that to do_reconfig and see
    # what it does with it:
    #
    # $ret == -1 : failure
    # $ret ==  0 : current config is optimal (or close enough)
    # $ret ==  1 : reconfiguration has occurred
    #
    # If $ret is -1 or 1, dump all our deltas and start from scratch.
    # Step 4 above will set do_reconfig soon thereafter.
    #
    # If $ret is 0, then nothing has happened because we're already
    # good enough. Set baseline_goodness to current goodness.

    $compdelta = compress_deltas(\@deltas);
    # VERIFY() is used here as returning true when its check fails: if
    # the compressed delta is not a hash ref, start over.
    if (VERIFY(ref($compdelta) eq "HASH", "couldn't compress deltas")) {
        clear_deltas();
        next;
    }
    $compdelta->{goodness} = goodness($compdelta);
    dumpdelta($compdelta) if $debug;

    $goodness = $compdelta->{goodness};
    syslog('debug', "GOODNESS: %5.2f%%", $goodness * 100);

    # Imbalance only counts once at least $statslen seconds of deltas
    # have been gathered.
    if ($deltas_tottime >= $statslen &&
        imbalanced($goodness, $baseline_goodness)) {
        $do_reconfig = 1;
    }

    if ($do_reconfig) {
        $ret = do_reconfig($compdelta);

        if ($ret != 0) {
            clear_deltas();
            syslog('debug', "do_reconfig FAILED!") if $ret == -1;
        } else {
            syslog('debug', "setting new baseline of $goodness");
            $baseline_goodness = $goodness;
        }
    }
    syslog('debug', "---------------------------------------");
}