#!/usr/bin/perl

# **** License ****
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# This code was originally developed by Vyatta, Inc.
# Portions created by Vyatta are Copyright (C) 2009,2010 Vyatta, Inc.
# All Rights Reserved.
#
# **** End License ****
#
use warnings;
use strict;
use Sys::Syslog qw(:standard :macros);

my $PATH_SYS_SYSTEM = "/sys/devices/system";
my $PATH_SYS_CPU0   = $PATH_SYS_SYSTEM . "/cpu/cpu0";

die "Usage: $0 ifname {auto | mask} { debug }\n" if ($#ARGV < 1);

my ($ifname, $mask, $debug)  = @ARGV;

die "Error: Interface $ifname does not exist\n"
    unless -d "/sys/class/net/$ifname";

my $logopt = defined($debug) ? "perror" : "";
openlog("irq-affinity", $logopt, LOG_LOCAL0);

my ($cpus, undef, $threads) = cpuinfo();

if ($mask eq 'auto') {
    affinity_auto($ifname);
} else {
    affinity_mask($ifname, $mask);
}

exit 0;

# Get current irq assignments by reading /proc/interrupts
# returns reference to hash of interrupt infromation for given interface
# i.e.  {'eth1'} => 22, {'eth1-tx-1'} => 31, ...
#
# Code based on parsing in irqbalance program
#
# Format of /proc/interrupts is:
#
#            CPU0       CPU1       
#  72:       1637          0   PCI-MSI-edge      eth3
sub irqinfo {
    my $ifname = shift;
    my $irqmap;

    open( my $f, '<', "/proc/interrupts" )
      or die "Can't read /proc/interrupts";

    # first line is the header we don't need
    <$f>;

    while (<$f>) {
        chomp;

	# lines with letters in front are special, like NMI count. 
	#
        # First column is IRQ number (and colon)
	# after that match as many entries with digits
        last unless /^\s*(\d+):\s/;
	my $irq = $1;

	my @cols = split;

	# skip the irq number and all counts
	do {
	    shift @cols;
	} while ($cols[0] =~ /^\d+$/);

	foreach my $name ( @cols ) {
            $name =~ s/,$//;

	    next unless ($name eq $ifname || $name =~ /^$ifname-/ );

	    $irqmap->{$name} = $irq;
        }
    }
    close $f;

    return $irqmap;
}

# compute number of bits set
sub hweight {
    my $bits = shift;
    my $count = 0;

    for (; $bits > 0; $bits >>= 1) {
	++$count if ($bits & 1);
    }
    return $count;
}

# count the bits set in a mapping file
sub path_sibling {
    my $path = shift;
    my $result = 0;

    open (my $f, '<', $path)
	or die "can't open $path : $!";

    my $line = <$f>;
    close $f;
    chomp $line;

    for my $mask (split(/,/, $line)) {
	$result += hweight(hex($mask));
    }

    return $result;
}

# Determine number of cpu topology information
#
# This algorithm is based on the command lscpu from util-linux
# it cases like multiple socket, offline cpus, etc
sub cpuinfo {
    my $cpu = 0;

    # XXX doesn't handle offline CPU's...
    while ( -e $PATH_SYS_SYSTEM . '/cpu/cpu' . $cpu ) {
	++$cpu;
    }

    my $thread = path_sibling($PATH_SYS_CPU0 . '/topology/thread_siblings');
    my $core   = path_sibling($PATH_SYS_CPU0 . '/topology/core_siblings') / $thread;
    my $socket = $cpu / $core / $thread;
    
    syslog(LOG_DEBUG, "cpus=%d cores=%d threads=%d sockets=%d\n", 
	   $cpu, $core, $thread, $socket);

    return ($cpu, $core, $thread);
}

# Set affinity value for a irq
sub set_affinity {
    my ( $ifname, $irq, $mask ) = @_;
    my $smp_affinity = "/proc/irq/$irq/smp_affinity";

    syslog(LOG_DEBUG, "%s: irq %d affinity set to 0x%x", $ifname, $irq, $mask);

    open( my $f, '>', $smp_affinity )
      or die "Can't open: $smp_affinity : $!\n";
    printf {$f} "%x\n", $mask;
    close $f;
}

# set Receive Packet Steering mask
sub set_rps {
    my ( $ifname, $q, $mask ) = @_;

    # ignore if older kernel without RPS
    my $rxq = "/sys/class/net/$ifname/queues";
    return unless ( -d $rxq );

    syslog(LOG_INFO, "%s: receive queue %d cpus set to 0x%x",
	   $ifname, $q, $mask);

    my $rps_cpus = "$rxq/rx-$q/rps_cpus";
    open( my $f, '>', $rps_cpus )
	or die "Can't open: $rps_cpus : $!\n";
    printf {$f} "%x\n", $mask;
    close $f;
}

# Check if the current if this cpu is in the banned mask
# Uses environment variable VYATTA_IRQAFFINITY_BANNED_CPUS
#  to mask cpus which irq affinity script should ignore
sub skip_cpu {
    my $cpu = shift;
    my $banned = $ENV{'VYATTA_IRQAFFINITY_BANNED_CPUS'};

    return unless defined($banned);	# false

    return ((1 << $cpu) & hex($banned)) != 0;
}

# For multi-queue NIC choose next cpu to be on next core
# XXX doesn't handle offline CPU's...
sub next_cpu {
    my $origcpu = shift;
    my $cpu = $origcpu;

    do {
	$cpu += $threads;
	if ( $cpu >= $cpus ) {
	    # wraparound to next thread on core 0
	    $cpu = ($cpu + 1) % $threads;
	}
    } while ($cpu != $origcpu && skip_cpu($cpu));

    return $cpu;
}

# Get cpu to assign for the queues for single queue nic
sub choose_cpu {
    my $ifname = shift;

    # For single-queue nic choose IRQ based on name
    #   Ideally should make decision on least loaded CPU
    my ($ifunit) = ($ifname =~ m/^[a-z]*(\d+)$/);
    die "can't find number for $ifname\n"
	unless defined($ifunit);

    # Give the load first to one CPU of each hyperthreaded core, then
    # if there are enough NICs, give the load to the other CPU of
    # each core.
    # XXX doesn't handle offline CPU's...
    my $ht_wrap = (($ifunit * $threads) / $cpus) % $threads;
    my $cpu = ((($ifunit * $threads) + $ht_wrap) % $cpus);

    $cpu = next_cpu($cpu) if skip_cpu($cpu);
    return $cpu;
}

# Assignment for multi-queue NICs
sub assign_multiqueue {
    my $ifname = shift;
    my $irqmap = shift;
    my $numq = scalar(@_);
    my $cpu;

    if ($numq == 1) {
	# This is a single-queue NIC using the multi-queue naming
	# format.  In this case, we use the same algorithm to select
	# the CPU as we use for standard single-queue NICs.  This
	# algorithm spreads the work of different NICs accross
	# different CPUs.

	$cpu = choose_cpu($ifname);
    } else {
	# For multi-queue nic's always starts with CPU 0
	#   This is less than ideal when there are more core's available
	#   than number of queues (probably should barber pole);
	#   but the Intel IXGBE needs CPU 0 <-> queue 0 
	#   because of flow director bug.

	$cpu = 0;
    }

    foreach my $name (sort @_) {
	my $irq = $irqmap->{$name};

	die "Can't find irq in map for $name\n" unless $irq;

	syslog(LOG_INFO, "%s: assign %s to cpu %d",
	       $ifname, $name, $cpu );

	# Assign CPU affinity for both IRQs
	set_affinity( $ifname, $irq, 1 << $cpu );

	# TODO use RPS to steer data if cores > queues?
	$cpu = next_cpu($cpu);
    }
}

# Affinity assignment function for single-queue NICs.  The strategy
# here is to just spread the interrupts of different NICs evenly
# across all CPUs.  That is the best we can do without monitoring the
# load and traffic patterns.  So we just directly map the NIC unit
# number into a CPU number.
sub assign_single {
    my ( $ifname, $irq ) = @_;
    my $cpu = choose_cpu($ifname);

    syslog(LOG_INFO,"%s: assign irq %d to cpu %d", $ifname, $irq, $cpu );

    set_affinity( $ifname, $irq, 1 << $cpu );

    if ($threads > 1) {
	# Use both threads on this cpu if hyperthreading
	my $mask = ((1 << $threads) - 1) << $cpu;
	set_rps($ifname, 0, $mask);
    }
    # MAYBE - Use all cpu's if no HT
}

# Mask must contain at least one CPU and
# no bits outside of range of available CPU's
sub check_mask {
    my ($ifname, $name, $mask) = @_;
    my $m = hex($mask);

    die "$ifname: $name mask $mask has no bits set\n"
	if ($m == 0);

    die "$ifname: $name mask $mask too large for number of CPU's: $cpus\n"
	if ($m >= 1 << $cpus);
}

# Set affinity (and RPS) based on mask
sub affinity_mask {
    my ($ifname, $mask) = @_;

    # match on <hex> or <hex>,<hex>
    unless ($mask =~ /^([0-9a-f]+)(|,([0-9a-f]+))$/) {
	die "$ifname: irq mask $mask is not a valid affinity mask\n"
    }

    my $irqmsk = $1;
    my $rpsmsk = $3;

    check_mask($ifname, "irq", $irqmsk);
    check_mask($ifname, "rps", $rpsmsk) if $rpsmsk;

    my $irqmap = irqinfo($ifname);
    while (my ($name, $irq) = each (%{$irqmap})) {
	syslog(LOG_INFO, "%s: assign irq %d mask %s", $name, $irq, $irqmsk);
	set_affinity($name, $irq, hex($irqmsk));
    }

    set_rps($ifname, 0, hex($rpsmsk)) if $rpsmsk;
}

# The auto strategy involves trying to achieve the following goals:
#
#  - Spread the receive load among as many CPUs as possible.
#
#  - For all multi-queue NICs in the system that provide both tx and
#    rx queues, keep all of the queues that share the same queue
#    number on same CPUs.  I.e. tx and rx queue 0 of all such NICs
#    should interrupt one CPU; tx and rx queue 1 should interrupt a
#    different CPU, etc.
#
#  - If hyperthreading is supported and enabled, avoid assigning
#    queues to both CPUs of a hyperthreaded pair if there are enough
#    CPUs available to do that.
sub affinity_auto {
    my $ifname   = shift;

    my $irqmap = irqinfo($ifname);
    my @irqnames = keys %{$irqmap};
    my $numirq = scalar(@irqnames);

    # Figure out what style of irq naming is being used
    if ( $numirq == 1 ) {
	my $irq = $irqmap->{$ifname};
	assign_single( $ifname, $irq) if $irq;
    } elsif ($numirq > 1) {
	# Special case for paired Rx and Tx
	my @mirq = grep { /^$ifname-rx-/ } @irqnames;
	my $num_mirq = scalar(@mirq);
        if ( $num_mirq > 0 ) {
	    assign_multiqueue( $ifname, $irqmap, @mirq );

	    @mirq = grep { /^$ifname-tx-/ } @irqnames;
	    assign_multiqueue( $ifname, $irqmap, @mirq );
	    return;
	}

	# Normal case for single irq per queue
	@mirq = grep { /^$ifname-/ } @irqnames;
	$num_mirq = scalar(@mirq);
	if ( $num_mirq > 0 ) {
	    assign_multiqueue( $ifname, $irqmap, @mirq );
	    return;
	}

	# Netxen thought up yet another convention
	@mirq = grep { /^$ifname\[/ } @irqnames;
	if ( $#mirq > 0 ) {
	    assign_multiqueue( $ifname, $irqmap, @mirq );
	    return;
	}

	syslog(LOG_ERR, "%s: Unknown multiqueue irq naming: %s\n", $ifname,
	       join(' ', @irqnames));
    }
}