diff options
author | Stephen Hemminger <stephen.hemminger@vyatta.com> | 2010-10-27 10:26:39 -0700 |
---|---|---|
committer | Stephen Hemminger <stephen.hemminger@vyatta.com> | 2010-10-27 10:26:39 -0700 |
commit | ab75c62a0c22d95512cf7792370f5552ac5da39e (patch) | |
tree | e03aeeebe7eafa73d639198c2f313d702345c6af /scripts/system | |
parent | 6794fb6c22fc79f5f31040a00723508b888950a3 (diff) | |
download | vyatta-cfg-quagga-ab75c62a0c22d95512cf7792370f5552ac5da39e.tar.gz vyatta-cfg-quagga-ab75c62a0c22d95512cf7792370f5552ac5da39e.zip |
IRQ affinity update
1. Move scripts to vyatta-cfg-system
2. Use syntax to check for legal values
3. Don't fail on UP
Diffstat (limited to 'scripts/system')
-rwxr-xr-x | scripts/system/vyatta-auto-irqaffin.pl | 441 | ||||
-rwxr-xr-x | scripts/system/vyatta-irqaffin | 198 |
2 files changed, 639 insertions, 0 deletions
diff --git a/scripts/system/vyatta-auto-irqaffin.pl b/scripts/system/vyatta-auto-irqaffin.pl new file mode 100755 index 00000000..be294439 --- /dev/null +++ b/scripts/system/vyatta-auto-irqaffin.pl @@ -0,0 +1,441 @@ +#!/usr/bin/perl +# +# Module: vyatta-auto-irqaffin.pl +# +# **** License **** +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# This code was originally developed by Vyatta, Inc. +# Portions created by Vyatta are Copyright (C) 2009,2010 Vyatta, Inc. +# All Rights Reserved. +# +# Author: Bob Gilligan (gilligan@vyatta.com) +# Date: October 2009 +# Description: Script to configure optimal IRQ affinity for NICs. +# +# **** End License **** +# + +# This script attempts to set up a static CPU affinity for the IRQs +# used by network interfaces. It is primarily targeted at supporting +# multi-queue NICs, but does include code to handle single-queue NICs. +# Since different NICs may have different queue organizations, and +# because there is no standard API for learning the mapping between +# queues and IRQ numbers, different code is required for each of the +# queue naming conventions. +# +# The general strategy involves trying to achieve the following goals: +# +# - Spread the receive load among as many CPUs as possible. +# +# - For all multi-queue NICs in the system that provide both tx and +# rx queues, keep all of the queues that share the same queue +# number on same CPUs. I.e. tx and rx queue 0 of all such NICs +# should interrupt one CPU; tx and rx queue 1 should interrupt a +# different CPU, etc. +# +# - If hyperthreading is supported and enabled, avoid assigning +# queues to both CPUs of a hyperthreaded pair if there are enough +# CPUs available to do that. +# +# This strategy yields the greatest MP scaling possible for +# multi-queue NICs. It also ensures that an individual skb is +# processed on the same CPU for the entirity of its lifecycle, +# including transmit time, which optimally utilizes the cache and +# keeps performance high. +# + + +use lib "/opt/vyatta/share/perl5"; +use Getopt::Long; + +use warnings; +use strict; + +# Send output of shell commands to syslog for debugging and so that +# the user is not confused by it. Log at debug level, which is supressed +# by default, so that we don't unnecessarily fill up the syslog file. +my $logger = 'logger -t firewall-cfg -p local0.debug --'; + +# Enable printing debug output to stdout. +my $debug_flag = 0; +my $syslog_flag = 0; + +my $setup_ifname; + +GetOptions("setup=s" => \$setup_ifname, + "debug" => \$debug_flag + ); + +sub log_msg { + my $message = shift; + + print "DEBUG: $message" if $debug_flag; + system("$logger DEBUG: \"$message\"") if $syslog_flag; +} + + +# Affinity assignment function for the Intel igb, ixgb and ixgbe +# drivers, and any other NICs that follow their queue naming +# convention. These NICs have an equal number of rx and tx queues. +# The first part of the strategy for optimal performance is to select +# the CPU to assign the IRQs to by mapping from the queue number. +# This ensures that all queues with the same queue number are assigned +# to the same CPU. The second part is to avoid assigning any queues +# to the second CPU in a hyper-threaded pair, if posible. I.e., if +# CPU 0 and 1 are hyper-threaded pairs, then assign a queue to CPU 0, +# but try to avoid assigning one to to CPU 1. But if we have more +# queues than CPUs, then it is OK to assign some to the second CPU in +# a hyperthreaded pair. +# +sub intel_func{ + my ($ifname, $numcpus, $numcores) = @_; + my $rx_queues; # number of rx queues + my $tx_queues; # number of tx queues + my $ht_factor; # 2 if HT enabled, 1 if not + + log_msg("intel_func was called.\n"); + + if ($numcpus > $numcores) { + $ht_factor = 2; + } else { + $ht_factor = 1; + } + + log_msg("ht_factor is $ht_factor.\n"); + + # Figure out how many queues we have + + $rx_queues=`grep "$ifname-rx-" /proc/interrupts | wc -l`; + $rx_queues =~ s/\n//; + + $tx_queues=`grep "$ifname-tx-" /proc/interrupts | wc -l`; + $tx_queues =~ s/\n//; + + log_msg("rx_queues is $rx_queues. tx_queues is $tx_queues\n"); + + if ($rx_queues != $tx_queues) { + printf("Error: rx and tx queues don't match for igb driver.\n"); + exit 1; + } + + # For i = 0 to number of queues: + # Affinity of rx and tx queue $i gets CPU ($i * (2 if HT, 1 if no HT)) + # % number_of_cpus + for (my $queue = 0, my $cpu = 0; ($queue < $rx_queues) ; $queue++) { + # Generate the hex string for the bitmask representing this CPU + my $cpu_bit = 1 << $cpu; + my $cpu_hex = sprintf("%x", $cpu_bit); + log_msg ("queue=$queue cpu=$cpu cpu_bit=$cpu_bit cpu_hex=$cpu_hex\n"); + + # Get the IRQ number for RX queue + my $rx_irq=`grep "$ifname-rx-$queue\$" /proc/interrupts | awk -F: '{print \$1}'`; + $rx_irq =~ s/\n//; + $rx_irq =~ s/ //g; + + # Get the IRQ number for TX queue + my $tx_irq=`grep "$ifname-tx-$queue\$" /proc/interrupts | awk -F: '{print \$1}'`; + $tx_irq =~ s/\n//; + $tx_irq =~ s/ //g; + + log_msg("rx_irq = $rx_irq. tx_irq = $tx_irq\n"); + + # Assign CPU affinity for both IRQs + system "echo $cpu_hex > /proc/irq/$rx_irq/smp_affinity"; + system "echo $cpu_hex > /proc/irq/$tx_irq/smp_affinity"; + + $cpu += $ht_factor; + + if ($cpu >= $numcpus) { + # Must "wrap" + $cpu %= $numcpus; + + if ($ht_factor > 1) { + # Next time through, select the other CPU in a hyperthreaded + # pair. + if ($cpu == 0) { + $cpu++; + } else { + $cpu--; + } + } + } + } +}; + + +# Affinity setting function for NICs using new intel queue scheme +# that provides one IRQ for each pair of TX and RX queues +sub intel_new_func{ + my ($ifname, $numcpus, $numcores) = @_; + my $txrx_queues; # number of rx/rx queue pairs + my $ht_factor; # 2 if HT enabled, 1 if not + + log_msg("intel_new_func was called.\n"); + + if ($numcpus > $numcores) { + $ht_factor = 2; + } else { + $ht_factor = 1; + } + + log_msg("ht_factor is $ht_factor.\n"); + + # Figure out how many queues we have + + $txrx_queues=`grep "$ifname-TxRx-" /proc/interrupts | wc -l`; + $txrx_queues =~ s/\n//; + + log_msg("txrx_queues is $txrx_queues.\n"); + + if ($txrx_queues <= 0) { + printf("Error: No TxRx queues found for new intel driver.\n"); + exit 1; + } + + # For i = 0 to number of queues: + # Affinity of TX/RX queue $i gets CPU ($i * (2 if HT, 1 if no HT)) + # % number_of_cpus + for (my $queue = 0, my $cpu = 0; ($queue < $txrx_queues) ; $queue++) { + # Generate the hex string for the bitmask representing this CPU + my $cpu_bit = 1 << $cpu; + my $cpu_hex = sprintf("%x", $cpu_bit); + log_msg ("queue=$queue cpu=$cpu cpu_bit=$cpu_bit cpu_hex=$cpu_hex\n"); + + # Get the IRQ number for RX queue + my $txrx_irq=`grep "$ifname-TxRx-$queue\$" /proc/interrupts | awk -F: '{print \$1}'`; + $txrx_irq =~ s/\n//; + $txrx_irq =~ s/ //g; + + log_msg("txrx_irq = $txrx_irq.\n"); + + # Assign CPU affinity for this IRQs + system "echo $cpu_hex > /proc/irq/$txrx_irq/smp_affinity"; + + $cpu += $ht_factor; + + if ($cpu >= $numcpus) { + # Must "wrap" + $cpu %= $numcpus; + + if ($ht_factor > 1) { + # Next time through, select the other CPU in a hyperthreaded + # pair. + if ($cpu == 0) { + $cpu++; + } else { + $cpu--; + } + } + } + } +}; + + +# Affinity assignment function for Broadcom NICs using the bnx2 driver +# or other multi-queue NICs that follow their queue naming convention. +# This strategy is similar to that for Intel drivers. But since +# Broadcom NICs do not have separate receive and transmit queues we +# perform one affinity assignment per queue. +# +sub broadcom_func{ + my ($ifname, $numcpus, $numcores) = @_; + my $num_queues; # number of queues + my $ht_factor; # 2 if HT enabled, 1 if not + + log_msg("broadcom_func was called.\n"); + + # Figure out how many queues we have + $num_queues=`egrep "$ifname\[-.\]\{1\}" /proc/interrupts | wc -l`; + $num_queues =~ s/\n//; + + log_msg("num_queues=$num_queues\n"); + + if ($num_queues <=0) { + printf("ERROR: No queues found for $ifname\n"); + exit 1; + } + + if ($numcpus > $numcores) { + $ht_factor = 2; + } else { + $ht_factor = 1; + } + + log_msg("ht_factor is $ht_factor.\n"); + + for (my $queue = 0, my $cpu = 0; ($queue < $num_queues) ; $queue++) { + # Generate the hex string for the bitmask representing this CPU + my $cpu_bit = 1 << $cpu; + my $cpu_hex = sprintf("%x", $cpu_bit); + log_msg ("queue=$queue cpu=$cpu cpu_bit=$cpu_bit cpu_hex=$cpu_hex\n"); + + # Get the IRQ number for the queue + my $irq=`egrep "$ifname\[-.fp\]*$queue\$" /proc/interrupts | awk -F: '{print \$1}'`; + $irq =~ s/\n//; + $irq =~ s/ //g; + + log_msg("irq = $irq.\n"); + + # Assign CPU affinity for this IRQs + system "echo $cpu_hex > /proc/irq/$irq/smp_affinity"; + + $cpu += $ht_factor; + if ($cpu >= $numcpus) { + # Must "wrap" + $cpu %= $numcpus; + + if ($ht_factor > 1) { + # Next time through, select the other CPU in a hyperthreaded + # pair. + if ($cpu == 0) { + $cpu++; + } else { + $cpu--; + } + } + } + } +} + + +# Affinity assignment function for single-quque NICs. The strategy +# here is to just spread the interrupts of different NICs evenly +# across all CPUs. That is the best we can do without monitoring the +# load and traffic patterns. So we just directly map the NIC unit +# number into a CPU number. +# +sub single_func { + my ($ifname, $numcpus, $numcores) = @_; + my $cpu; + use integer; + + log_msg("single_func was calledn.\n"); + + $ifname =~ m/^eth(.*)$/; + + my $ifunit = $1; + log_msg ("ifunit = $ifunit\n"); + + # Get the IRQ number for the queue + my $irq=`grep "$ifname" /proc/interrupts | awk -F: '{print \$1}'`; + $irq =~ s/\n//; + $irq =~ s/ //g; + + log_msg("irq = $irq.\n"); + + # Figure out what CPU to assign it to + if ($numcpus > $numcores) { + # Hyperthreaded + $cpu = (2 * $ifunit) % $numcpus; + + # every other time it wraps, add one to use the hyper-thread pair + # of the CPU selected. + my $use_ht = ((2 * $ifunit) / $numcpus) % 2; + $cpu += $use_ht; + } else { + # Not hyperthreaded. Map it to unit number MOD number of linux CPUs. + $cpu = $ifunit % $numcpus; + } + + # Generate the hex string for the bitmask representing this CPU + my $cpu_bit = 1 << $cpu; + my $cpu_hex = sprintf("%x", $cpu_bit); + log_msg ("cpu=$cpu cpu_bit=$cpu_bit cpu_hex=$cpu_hex\n"); + + # Assign CPU affinity for this IRQs + system "echo $cpu_hex > /proc/irq/$irq/smp_affinity"; +} + +# Mapping from driver type to function that handles it. +my %driver_hash = ( 'intel' => \&intel_func, + 'intel_new' => \&intel_new_func, + 'broadcom' => \&broadcom_func, + 'single' => \&single_func); + +if (defined $setup_ifname) { + # Set up automatic IRQ affinity for the named interface + + log_msg("setup $setup_ifname\n"); + + my $ifname = $setup_ifname; # shorter variable name + my $drivername; # Name of the NIC driver, e.g. "igb". + my $numcpus; # Number of Linux "cpus" + my $numcores; # Number of unique CPU cores + my $driver_func; # Pointer to fuction specific to a driver + my $driver_style; # Style of the driver. Whether it is multi-queue + # or not, and if it is, how it names its queues. + + # Determine how many CPUs the machine has. + $numcpus=`grep "^processor" /proc/cpuinfo | wc -l`; + $numcpus =~ s/\n//; + + log_msg("numcpus is $numcpus\n"); + + if ($numcpus == 1) { + # Nothing to do if we only have one CPU, so just exit quietly. + exit 0; + } + + # Determine how many cores the machine has. Could be less than + # the number of CPUs if processor supports hyperthreading. + $numcores=`grep "^core id" /proc/cpuinfo | uniq | wc -l`; + $numcores =~ s/\n//; + + log_msg("numcores is $numcores.\n"); + + # Verify that interface exists + if (! (-e "/proc/sys/net/ipv4/conf/$ifname")) { + printf("Error: Interface $ifname does not exist\n"); + exit 1; + } + + # Figure out what style of driver this NIC is using. + my $numints=`grep $ifname /proc/interrupts | wc -l`; + $numints =~ s/\n//; + if ($numints > 1) { + # It is a multiqueue NIC. Now figure out which one. + my $rx_queues=`grep "$ifname-rx-" /proc/interrupts | wc -l`; + $rx_queues =~ s/\n//; + if ($rx_queues > 0) { + # Driver is following the original Intel queue naming style + $driver_style="intel"; + } else { + my $rx_queues=`grep "$ifname-TxRx-" /proc/interrupts | wc -l`; + if ($rx_queues > 0) { + # Driver is following the new Intel queue naming + # style where on IRQ is used for each pair of + # TX and RX queues + $driver_style="intel_new"; + } else { + # The only other queue naming style that we have seen is the + # one used by Broadcom NICs. + $driver_style="broadcom"; + } + } + } elsif ($numints == 1) { + # It is a single queue NIC. + $driver_style="single"; + } else { + # $numints must be 0 + printf("Unable to determine IRQs for interface $ifname.\n"); + exit 0; + } + $driver_func = $driver_hash{$driver_style}; + + &$driver_func($ifname, $numcpus, $numcores); + + exit 0; +} + +printf("Must specify options.\n"); +exit(1); + + diff --git a/scripts/system/vyatta-irqaffin b/scripts/system/vyatta-irqaffin new file mode 100755 index 00000000..6fa0e086 --- /dev/null +++ b/scripts/system/vyatta-irqaffin @@ -0,0 +1,198 @@ +#!/bin/bash + +# Author: Robert E. Gilligan <gilligan@vyatta.com> +# Date: 2008 +# Description: CLI back-end script to manipulate NIC interrupt CPU affinity. + +# **** License **** +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# This code was originally developed by Vyatta, Inc. +# Portions created by Vyatta are Copyright (C) 2006, 2007, 2008 Vyatta, Inc. +# All Rights Reserved. +# **** End License **** + +# Provides sub-commands to: +# - Check the validity of an affinity mask value +# - Set the affinity mask to the IRQs being used by an interface +# - Reset the affinity mask of the IRQs being used by an interface to the +# system default value of all-ones. +# - Print the affinity mask of the IRQs being used by an interface +# +# If the NIC in question supports multiple IRQs, the "set" sub-command +# sets all IRQs to the same mask. The "print" sub-command displays +# the mask of each IRQ individually. +# + +# Max number of hex characters in an IRQ affinity mask. Support up to 64 CPUs. +MAX_MASK=16 + +# Set up some global values... +numcpus=`grep -c -e "^processor" /proc/cpuinfo` +declare -i maxmask=(2**numcpus) +let maxmask=maxmask-1 +maxmaskhex=`printf "%x" ${maxmask}` + +print_usage() +{ + echo "Usage:" + echo -e "\t$0 check <ifname> <mask>" + echo -e "\t$0 set <ifname> <mask>" + echo -e "\t$0 reset <ifname>" + echo -e "\t$0 print <ifname>" +} + +get_irqnums() +{ + irqnums=`grep $1 /proc/interrupts | awk -F ': ' '{ print $1 }'` + if [ -z "$irqnums" ]; then + echo "Unable to determine IRQs for interface $1" + return 1 + fi + return 0 +} + + +get_mask() +{ + mask=$1 + + # mask must be a short hex value + if [ ${#mask} -gt $MAX_MASK ]; then + echo "mask too long: ${#2} characters." + return 1 + fi + + # strip out all the hex digits + exmask=`echo $mask | sed -e s/[0-9a-fA-F]//g` + + # if anything is left, its not hex + if [ ! -z "$exmask" ]; then + echo "Invalid characters in hex mask: $exmask" + return 1 + fi + + declare -i intmask=0x${mask} + + # Make sure that mask holds at least one bit, and holds no more bits + # than we have CPUs. + + if [ ${intmask} -eq 0 ]; then + echo "Mask can not be 0." + return 1 + fi + + if [ $intmask -gt $maxmask ]; then + echo "Mask is too large. Maximum hexidecimal bitmask is: ${maxmaskhex}" + return 1 + fi + + return 0 +} + + +case "$1" in + check) + # Note: We don't validate the interface name even though + # it is available as a command argument. That is because + # the interface may not exist or may not be configured at + # the time the check is performed. + # + if [ $# -ne 3 ]; then + print_usage + exit 1 + fi + + if ! get_mask $3 ; then + exit 1 + fi + exit 0 + ;; + + set) + if [ $# -ne 3 ]; then + print_usage + exit 1 + fi + + if ! check_uniproc ; then + exit 1 + fi + + if ! get_irqnums $2 ; then + exit 1 + fi + + if ! get_mask $3 ; then + exit 1 + fi + + for irqnum in $irqnums ; do + echo $mask > /proc/irq/$irqnum/smp_affinity + done + + if [ $? -ne 0 ]; then + echo "Couldn't assign smp_affinity. Exit status: $?" + exit 1 + fi + ;; + + reset) + if [ $# -ne 2 ]; then + print_usage + exit 1 + fi + if ! get_irqnums $2 ; then + exit 1 + fi + + if [ -e /proc/irq/default_smp_affinity ]; then + defmask=`cat /proc/irq/default_smp_affinity` + else + defmask=$maxmaskhex + fi + + for irqnum in $irqnums ; do + echo $defmask > /proc/irq/$irqnum/smp_affinity + if [ $? -ne 0 ]; then + echo "Couldn't assign smp_affinity for IRQ $irqnum. Exit status: $?" + exit 1 + fi + done + ;; + + + print) + if [ $# -ne 2 ]; then + print_usage + exit 1 + fi + if ! get_irqnums $2 ; then + exit 1 + fi + + for irqnum in $irqnums ; do + mask=`cat /proc/irq/$irqnum/smp_affinity` + + if [ -z $mask ]; then + echo "Couldn't get smp_affinity for interface $2, irq $irqnum" + exit 1 + fi + + echo "Interface: $2 IRQ: $irqnum Mask: $mask" + done + ;; + + *) + print_usage + exit 1 + ;; + +esac |