File: //bin/google_set_multiqueue
#!/bin/bash
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# For a single-queue / no MSI-X virtionet device, sets the IRQ affinities to
# processor 0. For this virtionet configuration, distributing IRQs to all
# processors results in comparatively high cpu utilization and comparatively
# low network bandwidth.
#
# For a multi-queue / MSI-X virtionet device, sets the IRQ affinities to the
# per-IRQ affinity hint. The virtionet driver maps each virtionet TX (RX) queue
# MSI-X interrupt to a unique single CPU if the number of TX (RX) queues equals
# the number of online CPUs. The mapping of network MSI-X interrupt vector to
# CPUs is stored in the virtionet MSI-X interrupt vector affinity hint. This
# configuration allows network traffic to be spread across the CPUs, giving
# each CPU a dedicated TX and RX network queue, while ensuring that all packets
# from a single flow are delivered to the same CPU.
#
# For a gvnic device, set the IRQ affinities to the per-IRQ affinity hint.
# The google virtual ethernet driver maps each queue MSI-X interrupt to a
# unique single CPU, which is stored in the affinity_hint for each MSI-X
# vector. In older versions of the kernel, irqbalance is expected to copy the
# affinity_hint to smp_affinity; however, GCE instances disable irqbalance by
# default. This script copies over the affinity_hint to smp_affinity on boot to
# replicate the behavior of irqbalance.
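#
# At a high level the script: (1) enables the maximum number of combined
# channels on each virtionet NIC via ethtool; (2) sets virtionet IRQ
# affinities (intx IRQs to CPU 0, MSI-X IRQs to the CPU matching the queue
# number); (3) copies affinity_hint to smp_affinity for gvnic -ntfy-block
# IRQs; (4) programs xps_cpus for every TX queue; and (5) on multi-NIC
# accelerator platforms, binds gvnic IRQs to NUMA-local vCPU ranges.

# Returns 0 if $1 is a decimal integer; the numeric -eq comparison fails for
# anything else.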
function is_decimal_int() {
[ "${1}" -eq "${1}" ] > /dev/null 2>&1
}
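# Sets the number of combined channels (queues) on NIC $1 to $2.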
function set_channels() {
ethtool -L "${1}" combined "${2}" > /dev/null 2>&1
}
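# Binds the TX and RX IRQs of NIC $1 to a contiguous range of cores starting
# at core $2, and mirrors each TX IRQ's affinity into the matching TX queue's
# xps_cpus. The first half of the NIC's MSI-X IRQs is assumed to be TX and the
# second half RX (gve's IRQ layout).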
function set_irq_range() {
local -r nic="$1"
local bind_cores_begin="$2"
# The user may not have this $nic configured on their VM; if not, just skip
# it rather than erroring out.
if [ ! -d "/sys/class/net/${nic}/device" ]; then
return;
fi
# We count the number of rx queues and assume the number of rx queues equals
# the number of tx queues. The queue count shown in sysfs reflects the
# initially configured queues, while the number of IRQs reflects the maximum;
# the initial count is always less than or equal to the maximum.
core=$bind_cores_begin
num_irqs=$(( $(ls /sys/class/net/"$nic"/device/msi_irqs | wc -l) / 2 ))
num_q=$(ls -1 /sys/class/net/"$nic"/queues/ | grep rx | wc -l)
echo "Setting irq binding for "$nic" to core [$core - $((core + num_q - 1))] ..."
irqs=($(ls /sys/class/net/"$nic"/device/msi_irqs | sort -g))
for ((irq = 0; irq < "$num_irqs"; irq++)); do
tx_irq=${irqs[$irq]}
rx_irq=${irqs[$((irq + num_irqs))]}
# Only allocate $num_q cores to the IRQs and queues. If there are more IRQs
# than queues, the CPU assignment wraps around.
core=$(( bind_cores_begin + irq % num_q ))
# this is GVE's TX irq. See gve_tx_idx_to_ntfy().
echo "$core" > /proc/irq/"$tx_irq"/smp_affinity_list
# this is GVE's RX irq. See gve_rx_idx_to_ntfy().
echo "$core" > /proc/irq/"$rx_irq"/smp_affinity_list
# Check whether the queue currently exists: the number of IRQs equals the
# maximum number of queues allocated, which can be greater than the current
# number of queues.
tx_queue=/sys/class/net/"$nic"/queues/tx-"$irq"
if ls $tx_queue 1> /dev/null 2>&1; then
# XPS (Transmit Packet Steering) steers transmissions from a CPU to this
# queue when the CPU's bit is set in the queue's xps_cpus mask.
cp /proc/irq/"$tx_irq"/smp_affinity $tx_queue/xps_cpus
echo -en "$nic:q-$irq: \ttx: irq $tx_irq bind to $core \trx: irq $rx_irq bind to $core"
echo -e " \txps_cpus bind to $(cat $tx_queue/xps_cpus)"
else
echo -e "$nic:q-$irq: \ttx: irq $tx_irq bind to $core \trx: irq $rx_irq bind to $core"
fi
done
}
# Returns 0 (success) if all the interfaces have a physical-nic-id attribute
# on the metadata server.
function contains_pnic_ids() {
while IFS= read -r interface; do
echo "Interface: $interface"
network_interfaces_mds_attributes=$(curl -m 1 -H "Metadata-Flavor: Google" \
"http://169.254.169.254/computeMetadata/v1/instance/network-interfaces/$interface/")
if ! echo "$network_interfaces_mds_attributes" | grep -q "physical-nic-id"; then
echo "physical-nic-id NOT found in interface $interface"
return 1
fi
done < <(curl -m 1 -H "Metadata-Flavor: Google" http://169.254.169.254/computeMetadata/v1/instance/network-interfaces/)
return 0
}
# returns 0 (success) if the platform is a multinic accelerator platform.
function is_multinic_accelerator_platform() {
contains_pnic_ids
CONTAINS_PNIC_IDS=$?
machine_type=$(curl -m 1 -H "Metadata-Flavor: Google" \
http://169.254.169.254/computeMetadata/v1/instance/machine-type)
[[ $CONTAINS_PNIC_IDS -eq 0 \
|| "$machine_type" == *"a3-highgpu-8g"* \
|| "$machine_type" == *"a3-ultragpu-8g"* \
|| "$machine_type" == *"a3-megagpu-8g"* \
|| "$machine_type" == *"a3-edgegpu-8g"* \
|| "$machine_type" == *"a3-ultragpu-"* \
|| "$machine_type" == *"a4-highgpu-"* \
|| "$machine_type" == *"a4x-highgpu-"* ]] || return 1
return 0
}
# returns 0 (success) if the supplied nic is a Gvnic device.
function is_gvnic() {
local -r nic_name="$1"
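# ethtool -i reports the driver name; gvnic devices report "gve" (or "gvnic",
# depending on the driver build).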
driver_type=$(ethtool -i "$nic_name" | grep driver)
[[ "$driver_type" == *"gve"*
|| "$driver_type" == *"gvnic"* ]] || return 1
return 0
}
# Returns the vCPU ranges on each of the NUMA nodes, written into the named
# array in the form
# [numa0_irq_start_1, numa0_irq_end_1, numa0_irq_start_2, numa0_irq_end_2,
#  numa1_irq_start_1, numa1_irq_end_1, numa1_irq_start_2, numa1_irq_end_2]
# Only the vCPU ranges on NUMA0 and NUMA1 are returned, since accelerator
# platforms of GEN3 and later have only 2 NUMA nodes.
# The expected vCPU ranges on each platform are:
# A3/A3-mega:
# numa0: [0, 51] [104, 155]
# numa1: [52, 103] [156, 207]
# A3-ultra:
# numa0: [0, 55] [113, 168]
# numa1: [56, 112] [169, 224]
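# e.g. /sys/devices/system/node/node0/cpulist reads "0-51,104-155" on A3,
# which the parsing below splits on ',' and '-'.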
function get_vcpu_ranges_on_accelerator_platform {
declare -n arr_ref=$1
# Get vCPU ranges for NUMA 0
numa0_irq_range=($(cat /sys/devices/system/node/node0/cpulist))
numa0_irq_range0="${numa0_irq_range[0]%,*}"
numa0_irq_range1="${numa0_irq_range[0]#*,}"
numa0_irq_range0_start=$(echo "$numa0_irq_range0" | cut -d '-' -f 1)
# Avoid binding IRQs to vCPU 0, as it is a busy vCPU heavily used by the
# system.
numa0_irq_range0_start=$((numa0_irq_range0_start + 1))
numa0_irq_range0_end=$(echo "$numa0_irq_range0" | cut -d '-' -f 2)
numa0_irq_range1_start=$(echo "$numa0_irq_range1" | cut -d '-' -f 1)
numa0_irq_range1_end=$(echo "$numa0_irq_range1" | cut -d '-' -f 2)
# Get vCPU ranges for NUMA 1
numa1_irq_range=($(cat /sys/devices/system/node/node1/cpulist))
numa1_irq_range0="${numa1_irq_range[0]%,*}"
numa1_irq_range1="${numa1_irq_range[0]#*,}"
numa1_irq_range0_start=$(echo "$numa1_irq_range0" | cut -d '-' -f 1)
numa1_irq_range0_end=$(echo "$numa1_irq_range0" | cut -d '-' -f 2)
numa1_irq_range1_start=$(echo "$numa1_irq_range1" | cut -d '-' -f 1)
numa1_irq_range1_end=$(echo "$numa1_irq_range1" | cut -d '-' -f 2)
arr_ref=(
"$numa0_irq_range0_start"
"$numa0_irq_range0_end"
"$numa0_irq_range1_start"
"$numa0_irq_range1_end"
"$numa1_irq_range0_start"
"$numa1_irq_range0_end"
"$numa1_irq_range1_start"
"$numa1_irq_range1_end")
}
echo "Running $(basename $0)."
VIRTIO_NET_DEVS=/sys/bus/virtio/drivers/virtio_net/virtio*
is_multinic_accelerator_platform
IS_MULTINIC_ACCELERATOR_PLATFORM=$?
# Loop through all the virtionet devices and enable multi-queue
if [ -x "$(command -v ethtool)" ]; then
for dev in $VIRTIO_NET_DEVS; do
ETH_DEVS=${dev}/net/*
for eth_dev in $ETH_DEVS; do
eth_dev=$(basename "$eth_dev")
if ! errormsg=$(ethtool -l "$eth_dev" 2>&1); then
echo "ethtool says that $eth_dev does not support virtionet multiqueue: $errormsg."
continue
fi
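# The first "Combined" line in 'ethtool -l' output is the pre-set maximum;
# fields are tab-separated, hence 'cut -f2'.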
num_max_channels=$(ethtool -l "$eth_dev" | grep -m 1 Combined | cut -f2)
if [[ -n "${num_max_channels}" || "${num_max_channels}" -eq "1" ]]; then
echo "num_max_channels is n/a, skipping set channels for $eth_dev"
continue
fi
if is_decimal_int "$num_max_channels" && \
set_channels "$eth_dev" "$num_max_channels"; then
echo "Set channels for $eth_dev to $num_max_channels."
else
echo "Could not set channels for $eth_dev to $num_max_channels."
fi
done
done
else
echo "ethtool not found: cannot configure virtionet multiqueue."
fi
for dev in $VIRTIO_NET_DEVS
do
dev=$(basename "$dev")
irq_dir=/proc/irq/*
for irq in $irq_dir
do
smp_affinity="${irq}/smp_affinity_list"
[ ! -f "${smp_affinity}" ] && continue
# Classify this IRQ as virtionet intx, virtionet MSI-X, or non-virtionet
# If the IRQ type is virtionet intx, a subdirectory with the same name as
# the device will be present. If the IRQ type is virtionet MSI-X, then
# a subdirectory of the form <device name>-<input|output>.N will exist.
# In this case, N is the input (output) queue number, and is specified as
# a decimal integer ranging from 0 to K - 1 where K is the number of
# input (output) queues in the virtionet device.
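# e.g. /proc/irq/<n>/virtio0-input.0 (IRQ number is illustrative).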
virtionet_intx_dir="${irq}/${dev}"
virtionet_msix_dir_regex=".*/${dev}-(input|output)\.([0-9]+)$"
if [ -d "${virtionet_intx_dir}" ]; then
# All virtionet intx IRQs are delivered to CPU 0
echo "Setting ${smp_affinity} to 01 for device ${dev}."
echo "01" > "${smp_affinity}"
continue
fi
# Not virtionet intx, probe for MSI-X
virtionet_msix_found=0
for entry in ${irq}/${dev}*; do
if [[ "$entry" =~ ${virtionet_msix_dir_regex} ]]; then
virtionet_msix_found=1
queue_num=${BASH_REMATCH[2]}
fi
done
affinity_hint="${irq}/affinity_hint"
[ "$virtionet_msix_found" -eq 0 -o ! -f "${affinity_hint}" ] && continue
# Set the IRQ CPU affinity to the virtionet-initialized affinity hint
echo "Setting ${smp_affinity} to ${queue_num} for device ${dev}."
echo "${queue_num}" > "${smp_affinity}"
real_affinity=`cat ${smp_affinity}`
echo "${smp_affinity}: real affinity ${real_affinity}"
done
done
# Set smp_affinity properly for gvnic queues. '-ntfy-block.' is unique to gve
# and will not affect virtio queues.
for i in /proc/irq/*; do
if ls ${i}/*-ntfy-block.* 1> /dev/null 2>&1; then
if [ -f ${i}/affinity_hint ]; then
echo Setting smp_affinity on ${i} to $(cat ${i}/affinity_hint)
cp ${i}/affinity_hint ${i}/smp_affinity
fi
fi
done
XPS=/sys/class/net/e*/queues/tx*/xps_cpus
num_cpus=$(nproc)
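# Cap at 63 CPUs so the (1 << cpu) arithmetic below stays within bash's
# signed 64-bit integers.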
[[ $num_cpus -gt 63 ]] && num_cpus=63
num_queues=0
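# Count the TX queues across the interfaces that will receive an XPS mask (on
# multi-NIC accelerator platforms, only gvnic interfaces are counted).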
for q in $XPS; do
interface=$(echo "$q" | grep -oP 'net/\K[^/]+')
if [[ $IS_MULTINIC_ACCELERATOR_PLATFORM == 0 ]] && ! is_gvnic "$interface"; then
continue
fi
num_queues=$((num_queues + 1))
done
# If we have more CPUs than queues, then stripe CPUs across tx affinity
# as CPUNumber % queue_count.
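# For example, with 4 queues and 8 CPUs, queue 0 gets CPUs {0,4}, queue 1
# gets {1,5}, queue 2 gets {2,6}, and queue 3 gets {3,7}.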
for q in $XPS; do
interface=$(echo "$q" | grep -oP 'net/\K[^/]+')
if [[ $IS_MULTINIC_ACCELERATOR_PLATFORM == 0 ]] && ! is_gvnic "$interface"; then
continue
fi
queue_re=".*tx-([0-9]+).*$"
if [[ "$q" =~ ${queue_re} ]]; then
queue_num=${BASH_REMATCH[1]}
fi
xps=0
for cpu in `seq $queue_num $num_queues $((num_cpus - 1))`; do
xps=$((xps | (1 << cpu)))
done
# Linux xps_cpus expects a hex bitmap written as comma-separated 32-bit
# dwords. Bits above the number of CPUs are ignored, so write the mask as a
# list of 32-bit hex values separated by commas.
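# e.g. on a machine with more than 32 CPUs, a mask covering CPUs 1 and 32 is
# written as "00000001,00000002" (most significant dword first).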
xps_dwords=()
for i in $(seq 0 $(((num_cpus - 1) / 32)))
do
xps_dwords=(`printf "%08x" $((xps & 0xffffffff))` "${xps_dwords[@]}")
xps=$((xps >> 32))
done
xps_string=$(IFS=, ; echo "${xps_dwords[*]}")
echo ${xps_string} > $q
printf "Queue %d XPS=%s for %s\n" $queue_num `cat $q` $q
done | sort -n -k2
if [[ ! $IS_MULTINIC_ACCELERATOR_PLATFORM == 0 ]]; then
exit
fi
# Assign IRQ binding for network interfaces based on pci bus ordering.
#
# The commands below illustrate how we rank interfaces by PCI bus order.
# > find /sys/class/net -type l | xargs -L 1 realpath | sort
# /sys/devices/pci0000:00/0000:00:0b.0/net/enp0s11
# /sys/devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:02.0/0000:06:00.0/net/enp6s0
# /sys/devices/pci0000:07/0000:07:00.0/0000:08:00.0/0000:09:02.0/0000:0c:00.0/net/enp12s0
# /sys/devices/pci0000:81/0000:81:00.0/0000:82:00.0/0000:83:02.0/0000:86:00.0/net/enp134s0
# /sys/devices/pci0000:87/0000:87:00.0/0000:88:00.0/0000:89:02.0/0000:8c:00.0/net/enp140s0
# /sys/devices/virtual/net/lo
#
# > find /sys/class/net -type l | xargs -L 1 realpath | sort | xargs -L 1 basename | grep -v lo
# enp0s11
# enp6s0
# enp12s0
# enp134s0
# enp140s0
irq_ranges=()
get_vcpu_ranges_on_accelerator_platform irq_ranges
echo "Binding vCPUs on NUMA0 [${irq_ranges[0]} ${irq_ranges[1]}], [${irq_ranges[2]} ${irq_ranges[3]}]}"
echo "Binding vCPUs on NUMA1 [${irq_ranges[4]} ${irq_ranges[5]}], [${irq_ranges[6]} ${irq_ranges[7]}]}"
numa0_irq_start=${irq_ranges[0]}
find /sys/class/net -type l | xargs -L 1 realpath | grep '/sys/devices/pci' | sort | xargs -L 1 basename | while read nic_name; do
# For non-gvnic devices (e.g. mlx5), the IRQ bindings will be handled by the device's driver.
if ! is_gvnic "$nic_name"; then
echo "$nic_name is not gvnic device, skipping set irq on this device"
continue
fi
echo "$nic_name is Gvnic device, continuing set IRQ on $nic_name ."
nic_numa_node=$(cat /sys/class/net/"$nic_name"/device/numa_node)
if [[ $nic_numa_node -ne 0 ]]; then
continue
fi
nic_num_queues=$(ls -1 /sys/class/net/"$nic_name"/queues/ | grep rx | wc -l)
bind_cores_begin=$numa0_irq_start
bind_cores_end=$((bind_cores_begin + nic_num_queues))
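# If this NIC's cores would run past the end of NUMA0's first vCPU range,
# start it at the beginning of the node's second range instead.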
if [[ $bind_cores_begin -lt ${irq_ranges[1]} ]] && [[ $bind_cores_end -gt ${irq_ranges[1]} ]]; then
bind_cores_begin=${irq_ranges[2]}
bind_cores_end=$((bind_cores_begin + nic_num_queues))
fi
set_irq_range "$nic_name" "$bind_cores_begin"
numa0_irq_start=$bind_cores_end
done
numa1_irq_start=${irq_ranges[4]}
find /sys/class/net -type l | xargs -L 1 realpath | grep '/sys/devices/pci' | sort | xargs -L 1 basename | while read nic_name; do
# For non-gvnic devices (e.g. mlx5), the IRQ bindings will be handled by the device's driver.
if ! is_gvnic "$nic_name"; then
echo "$nic_name is not gvnic device, skipping set irq on this device"
continue
fi
echo "$nic_name is Gvnic device, continuing set IRQ on $nic_name ."
nic_numa_node=$(cat /sys/class/net/"$nic_name"/device/numa_node)
if [[ $nic_numa_node -ne 1 ]]; then
continue
fi
nic_num_queues=$(ls -1 /sys/class/net/"$nic_name"/queues/ | grep rx | wc -l)
bind_cores_begin=$numa1_irq_start
bind_cores_end=$((bind_cores_begin + nic_num_queues))
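# If this NIC's cores would run past the end of NUMA1's first vCPU range,
# start it at the beginning of the node's second range instead.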
if [[ $bind_cores_begin -lt ${irq_ranges[5]} ]] && [[ $bind_cores_end -gt ${irq_ranges[5]} ]]; then
bind_cores_begin=${irq_ranges[6]}
bind_cores_end=$((bind_cores_begin + nic_num_queues))
fi
set_irq_range "$nic_name" "$bind_cores_begin"
numa1_irq_start=$bind_cores_end
done