From 28e86f46e504eff3ab087aa179a969b1e3383a07 Mon Sep 17 00:00:00 2001 From: Bob Gilligan Date: Mon, 26 Oct 2009 17:06:48 -0700 Subject: Generalize handling of multi-queue NICs. Previously the script determined the driver that a NIC was using, then used that to parse the queue names from /proc/interrupts. Now, it determines the naming convention just by looking at the queue names. Now the script should work for all NICs that support one of the two standard naming conventions. Also added some support for single-queue NICs. --- scripts/vyatta-auto-irqaffin.pl | 172 +++++++++++++++++++++++++++++----------- 1 file changed, 124 insertions(+), 48 deletions(-) diff --git a/scripts/vyatta-auto-irqaffin.pl b/scripts/vyatta-auto-irqaffin.pl index 521a181..fa58420 100644 --- a/scripts/vyatta-auto-irqaffin.pl +++ b/scripts/vyatta-auto-irqaffin.pl @@ -23,23 +23,34 @@ # **** End License **** # -# This script attempts to perform a static affinity assignment for network -# interfaces. It is primarily targeted at supporting multi-queue NICs. +# This script attempts to set up a static CPU affinity for the IRQs +# used by network interfaces. It is primarily targeted at supporting +# multi-queue NICs, but does include code to handle single-queue NICs. # Since different NICs may have different queue organizations, and # because there is no standard API for learning the mapping between -# queues and IRQ numbers, different code is required for each driver. +# queues and IRQ numbers, different code is required for each of the +# queue naming conventions. +# +# The general strategy involves trying to achieve the following goals: # -# The general strategy includes: # - Spread the receive load among as many CPUs as possible. -# - For NICs that provide both rx and tx queue, keep the tx queue -# on the same CPU as the corresponding rx queue. -# - For all multi-queue NICs in the system, the same tx and rx queue -# numbers should interrupt the same CPUs. I.e. 
tx and rx queue 0 -# of all NICs should interrupt the same CPU. +# +# - For all multi-queue NICs in the system that provide both tx and +# rx queues, keep all of the queues that share the same queue +# number on the same CPU. I.e. tx and rx queue 0 of all such NICs +# should interrupt one CPU; tx and rx queue 1 should interrupt a +# different CPU, etc. +# # - If hyperthreading is supported and enabled, avoid assigning # queues to both CPUs of a hyperthreaded pair if there are enough # CPUs available to do that. # +# This strategy yields the greatest MP scaling possible for +# multi-queue NICs. It also ensures that an individual skb is +# processed on the same CPU for the entirety of its lifecycle, +# including transmit time, which optimally utilizes the cache and +# keeps performance high. +# use lib "/opt/vyatta/share/perl5"; @@ -71,24 +82,26 @@ sub log_msg { } -# Affinity strategy function for the igb driver. NICs using this -# driver have an equal number of rx and tx queues. The first part of -# the strategy for optimal performance is to assign irq of each queue -# in a pair of tx and rx queues that have the same queue number to the -# same CPU. I.e., assign queue 0 to CPU X, queue 1 to CPU Y, etc. -# The second part is to avoid assigning any queues to the second CPU -# in a hyper-threaded pair, if posible. I.e., if CPU 0 and 1 are -# hyper-threaded pairs, then assign a queue to CPU 0, but try to avoid -# assigning one to to CPU 1. But if we have more queues than CPUs, then -# it is OK to assign some to the second CPU in a hyperthreaded pair. +# Affinity assignment function for the Intel igb, ixgb and ixgbe +# drivers, and any other NICs that follow their queue naming +# convention. These NICs have an equal number of rx and tx queues. +# The first part of the strategy for optimal performance is to select +# the CPU to assign the IRQs to by mapping from the queue number. +# This ensures that all queues with the same queue number are assigned +# to the same CPU. 
The second part is to avoid assigning any queues +# to the second CPU in a hyper-threaded pair, if possible. I.e., if +# CPU 0 and 1 are hyper-threaded pairs, then assign a queue to CPU 0, +# but try to avoid assigning one to CPU 1. But if we have more +# queues than CPUs, then it is OK to assign some to the second CPU in +# a hyperthreaded pair. # -sub igb_func{ +sub intel_func{ my ($ifname, $numcpus, $numcores) = @_; my $rx_queues; # number of rx queues my $tx_queues; # number of tx queues my $ht_factor; # 2 if HT enabled, 1 if not - log_msg("igb_func was called.\n"); + log_msg("intel_func was called.\n"); if ($numcpus > $numcores) { $ht_factor = 2; @@ -157,14 +170,18 @@ sub igb_func{ } }; -# Similar strategy as for igb driver, but Broadcom NICs do not have -# separate receive and transmit queues. -sub bnx2_func{ +# Affinity assignment function for Broadcom NICs using the bnx2 driver +# or other multi-queue NICs that follow their queue naming convention. +# This strategy is similar to that for Intel drivers. But since +# Broadcom NICs do not have separate receive and transmit queues we +# perform one affinity assignment per queue. +# +sub broadcom_func{ my ($ifname, $numcpus, $numcores) = @_; my $num_queues; # number of queues my $ht_factor; # 2 if HT enabled, 1 if not - log_msg("bnx2_func was called.\n"); + log_msg("broadcom_func was called.\n"); # Figure out how many queues we have $num_queues=`grep "$ifname-" /proc/interrupts | wc -l`; @@ -219,9 +236,59 @@ sub bnx2_func{ } } -my %driver_hash = ( 'igb' => \&igb_func, - 'ixbg' => \&igb_func, - 'bnx2' =>\&bnx2_func ); + +# Affinity assignment function for single-queue NICs. The strategy +# here is to just spread the interrupts of different NICs evenly +# across all CPUs. That is the best we can do without monitoring the +# load and traffic patterns. So we just directly map the NIC unit +# number into a CPU number. 
+# +sub single_func { + my ($ifname, $numcpus, $numcores) = @_; + my $cpu; + use integer; + + log_msg("single_func was calledn.\n"); + + $ifname =~ m/^eth(.*)$/; + + my $ifunit = $1; + log_msg ("ifunit = $ifunit\n"); + + # Get the IRQ number for the queue + my $irq=`grep "$ifname" /proc/interrupts | awk -F: '{print \$1}'`; + $irq =~ s/\n//; + $irq =~ s/ //g; + + log_msg("irq = $irq.\n"); + + # Figure out what CPU to assign it to + if ($numcpus > $numcores) { + # Hyperthreaded + $cpu = (2 * $ifunit) % $numcpus; + + # every other time it wraps, add one to use the hyper-thread pair + # of the CPU selected. + my $use_ht = ((2 * $ifunit) / $numcpus) % 2; + $cpu += $use_ht; + } else { + # Not hyperthreaded. Map it to unit number MOD number of linux CPUs. + $cpu = $ifunit % $numcpus; + } + + # Generate the hex string for the bitmask representing this CPU + my $cpu_bit = 1 << $cpu; + my $cpu_hex = sprintf("%x", $cpu_bit); + log_msg ("cpu=$cpu cpu_bit=$cpu_bit cpu_hex=$cpu_hex\n"); + + # Assign CPU affinity for this IRQs + system "echo $cpu_hex > /proc/irq/$irq/smp_affinity"; +} + +# Mapping from driver type to function that handles it. +my %driver_hash = ( 'intel' => \&intel_func, + 'broadcom' => \&broadcom_func, + 'single' => \&single_func); if (defined $setup_ifname) { # Set up automatic IRQ affinity for the named interface @@ -233,8 +300,10 @@ if (defined $setup_ifname) { my $numcpus; # Number of Linux "cpus" my $numcores; # Number of unique CPU cores my $driver_func; # Pointer to fuction specific to a driver + my $driver_style; # Style of the driver. Whether it is multi-queue + # or not, and if it is, how it names its queues. - # Determine how many CPUs the machine has + # Determine how many CPUs the machine has. $numcpus=`grep "^processor" /proc/cpuinfo | wc -l`; $numcpus =~ s/\n//; @@ -245,32 +314,39 @@ if (defined $setup_ifname) { exit 0; } + # Determine how many cores the machine has. Could be less than + # the number of CPUs if processor supports hyperthreading. 
+ $numcores=`grep "^core id" /proc/cpuinfo | uniq | wc -l`; + $numcores =~ s/\n//; + + log_msg("numcores is $numcores.\n"); + # Verify that interface exists if (! (-e "/proc/sys/net/ipv4/conf/$ifname")) { printf("Error: Interface $ifname does not exist\n"); exit 1; } - # Figure out what driver this NIC is using. - $drivername=`ethtool -i $ifname | grep "^driver" | awk '{print \$2}'`; - $drivername =~ s/\n//; - - log_msg("drivername is $drivername\n"); - - $driver_func = $driver_hash{$drivername}; - - # We only support a couple of drivers at this time, so just exit - # if its not one we support. - if (! defined($driver_func)) { - printf("Automatic SMP affinity not supported for NICs using the $drivername driver.\n"); - exit 0; # not an error + # Figure out what style of driver this NIC is using. + my $numints=`grep $ifname /proc/interrupts | wc -l`; + $numints =~ s/\n//; + if ($numints > 1) { + # It is a multiqueue NIC. Now figure out which one. + my $rx_queues=`grep "$ifname-rx-" /proc/interrupts | wc -l`; + $rx_queues =~ s/\n//; + if ($rx_queues > 0) { + # Driver is following the Intel queue naming style + $driver_style="intel"; + } else { + # The only other queue naming style that we have seen is the + # one used by Broadcom NICs. + $driver_style="broadcom"; + } + } else { + # It is a single queue NIC. 
+ $driver_style="single"; } - - # Determine whether machine has hyperthreading enabled - $numcores=`grep "^core id" /proc/cpuinfo | uniq | wc -l`; - $numcores =~ s/\n//; - - log_msg("numcores is $numcores.\n"); + $driver_func = $driver_hash{$driver_style}; &$driver_func($ifname, $numcpus, $numcores); -- cgit v1.2.3 From 47cd5485b45d40b29d96d09c3c133d7c32ada4c3 Mon Sep 17 00:00:00 2001 From: Bob Gilligan Date: Mon, 26 Oct 2009 17:09:31 -0700 Subject: 0.15.32 --- debian/changelog | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/debian/changelog b/debian/changelog index b119c4e..7ae7277 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,13 @@ +vyatta-cfg (0.15.32) unstable; urgency=low + + [ slioch ] + * dismantle old priority file parsing and node matching code. removed + + [ Bob Gilligan ] + * Generalize handling of multi-queue NICs. + + -- Bob Gilligan Mon, 26 Oct 2009 17:09:30 -0700 + vyatta-cfg (0.15.31) unstable; urgency=low [ slioch ] -- cgit v1.2.3 From c9a5284343b95baae37b13240a350da5f171b8dc Mon Sep 17 00:00:00 2001 From: slioch Date: Tue, 27 Oct 2009 15:33:11 -0700 Subject: removed out of date priority values from priority file --- Makefile.am | 1 + templates/priority | 106 ++--------------------------------------------------- 2 files changed, 4 insertions(+), 103 deletions(-) diff --git a/Makefile.am b/Makefile.am index 54ef9d6..cd8ce56 100644 --- a/Makefile.am +++ b/Makefile.am @@ -46,6 +46,7 @@ src_check_tmpl_SOURCES = src/check_tmpl.c sbin_SCRIPTS = scripts/vyatta-cfg-cmd-wrapper sbin_SCRIPTS += scripts/vyatta-validate-type.pl sbin_SCRIPTS += scripts/vyatta-find-type.pl +sbin_SCRIPTS += scripts/priority.pl sbin_SCRIPTS += scripts/vyatta-config-loader.pl sbin_SCRIPTS += scripts/vyatta-config-gen-sets.pl sbin_SCRIPTS += scripts/vyatta-cli-expand-var.pl diff --git a/templates/priority b/templates/priority index e32a5c0..2359178 100644 --- a/templates/priority +++ b/templates/priority @@ -92,107 +92,7 @@ # "active config" tree at 
the time the lower-level node is committed. # -200 firewall/group/address-group -200 firewall/group/network-group -200 firewall/group/port-group -210 firewall/name/node.tag -210 firewall/modify/node.tag -210 firewall/ipv6-name/node.tag -210 firewall/ipv6-modify/node.tag -215 firewall -310 interfaces/bridge -315 interfaces/bonding -318 interfaces/ethernet -319 interfaces/ethernet/node.tag/vif -319 interfaces/ethernet/node.tag/bond-group -320 interfaces/ethernet/node.tag/vif/node.tag/bridge-group -320 interfaces/bonding/node.tag/bridge-group -320 interfaces/bonding/node.tag/vif -320 interfaces/bridge/node.tag/address -320 interfaces/loopback -330 interfaces/adsl -340 interfaces/serial -350 interfaces/wirelessmodem -350 interfaces/wireless -380 interfaces/tunnel -380 interfaces/openvpn -390 interfaces/pseudo-ethernet -391 interfaces/pseudo-ethernet/node.tag/vif -400 system/domain-name -400 system/domain-search -400 system/gateway-address -400 system/host-name -400 system/ip -400 system/ipv6 -400 system/login -400 system/name-server -400 system/ntp-server -400 system/options -400 system/package -400 system/static-host-mapping -400 system/syslog -400 system/time-zone -405 system -450 protocols/static -470 policy -500 protocols/bgp/node.tag/parameters -510 protocols/bgp/node.tag/neighbor -520 protocols/bgp -610 protocols/ospf/parameters -620 protocols/ospf -630 protocols/ospfv3/parameters -640 protocols/ospfv3 -650 protocols/rip -660 protocols/ripng -800 interfaces/ethernet/node.tag/vrrp -800 interfaces/ethernet/node.tag/vif/node.tag/vrrp -810 interfaces/serial/node.tag/frame-relay/vif -810 interfaces/serial/node.tag/ppp -810 interfaces/serial/node.tag/ppp/vif -810 interfaces/serial/node.tag/cisco-hdlc/vif -850 interfaces -# Router advertisement daemon startup should take place after interfaces -# have been fully configured. We have a router-advert node under just about -# every interface type, hence the large number of priority nodes in this -# source file. 
They can be removed from this source file once bug 4903 -# is fixed -860 interfaces/ethernet/node.tag/ipv6/router-advert -860 interfaces/ethernet/node.tag/pppoe/node.tag/ipv6/router-advert -860 interfaces/ethernet/node.tag/vif/node.tag/ipv6/router-advert -860 interfaces/ethernet/node.tag/vif/node.tag/pppoe/node.tag/ipv6/router-advert -860 interfaces/bonding/node.tag/ipv6/router-advert -860 interfaces/bonding/node.tag/vif/node.tag/ipv6/router-advert -860 interfaces/tunnel/node.tag/ipv6/router-advert -860 interfaces/bridge/node.tag/ipv6/router-advert -860 interfaces/openvpn/node.tag/ipv6/router-advert -860 interfaces/wirelessmodem/node.tag/ipv6/router-advert -860 interfaces/multilink/node.tag/vif/node.tag/ipv6/router-advert -860 interfaces/adsl/node.tag/pvc/node.tag/bridged-ethernet/ipv6/router-advert -860 interfaces/adsl/node.tag/pvc/node.tag/classical-ipoa/ipv6/router-advert -860 interfaces/adsl/node.tag/pvc/node.tag/pppoa/node.tag/ipv6/router-advert -860 interfaces/adsl/node.tag/pvc/node.tag/pppoe/node.tag/ipv6/router-advert -860 interfaces/serial/node.tag/cisco-hdlc/vif/node.tag/ipv6/router-advert -860 interfaces/serial/node.tag/frame-relay/vif/node.tag/ipv6/router-advert -860 interfaces/serial/node.tag/ppp/vif/node.tag/ipv6/router-advert - -900 vpn -900 qos-policy -900 test-definition -900 content-inspection -900 load-balancing -900 protocols -900 service -910 service/dhcp-relay -911 service/dhcp-server -913 service/https -914 service/nat -915 service/ssh -916 service/telnet -917 service/webproxy -918 service/dns/forwarding -919 service/dns/dynamic -960 cluster -970 zone-policy/zone/node.tag/from -975 zone-policy -980 protocols/snmp +# +# RUN perl /opt/vyatta/sbin/priority.pl to generate the current priority listings +# \ No newline at end of file -- cgit v1.2.3 From 1f17200053dcf6fa9e02fe0b065f382dc78aed13 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 27 Oct 2009 17:13:26 -0700 Subject: 0.15.33 --- debian/changelog | 7 +++++++ 1 file changed, 7 insertions(+) 
diff --git a/debian/changelog b/debian/changelog index 7ae7277..80bc7e3 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +vyatta-cfg (0.15.33) unstable; urgency=low + + [ slioch ] + * removed out of date priority values from priority file + + -- root Tue, 27 Oct 2009 17:13:25 -0700 + vyatta-cfg (0.15.32) unstable; urgency=low [ slioch ] -- cgit v1.2.3