File: /bin/google_set_multiqueue
#!/bin/bash
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# For a single-queue / no MSI-X virtionet device, sets the IRQ affinities to
# processor 0. For this virtionet configuration, distributing IRQs to all
# processors results in comparatively high cpu utilization and comparatively
# low network bandwidth.
#
# For a multi-queue / MSI-X virtionet device, sets the IRQ affinities to the
# per-IRQ affinity hint. The virtionet driver maps each virtionet TX (RX) queue
# MSI-X interrupt to a unique single CPU if the number of TX (RX) queues equals
# the number of online CPUs. The mapping of network MSI-X interrupt vector to
# CPUs is stored in the virtionet MSI-X interrupt vector affinity hint. This
# configuration allows network traffic to be spread across the CPUs, giving
# each CPU a dedicated TX and RX network queue, while ensuring that all packets
# from a single flow are delivered to the same CPU.
#
# For a gvnic device, sets the IRQ affinities to the per-IRQ affinity hint.
# The Google virtual ethernet driver maps each queue MSI-X interrupt to a
# unique single CPU, which is stored in the affinity_hint for each MSI-X
# vector. In older versions of the kernel, irqbalance is expected to copy the
# affinity_hint to smp_affinity; however, GCE instances disable irqbalance by
# default. This script copies over the affinity_hint to smp_affinity on boot to
# replicate the behavior of irqbalance.
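#
# As a minimal illustration (using a hypothetical IRQ number 35), the manual
# equivalent of what the gvnic handling below automates would be:
#   cat /proc/irq/35/affinity_hint > /proc/irq/35/smp_affinity
# The loops further down discover the real IRQ numbers from /proc/irq and
# sysfs instead of hard-coding them.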

function is_decimal_int() {
  [ "${1}" -eq "${1}" ] > /dev/null 2>&1
}

function set_channels() {
  ethtool -L "${1}" combined "${2}" > /dev/null 2>&1
}

function set_irq_range() {
  local -r nic="$1"
  local bind_cores_begin="$2"

  # The user may not have this $nic configured on their VM, if not, just skip
  # it, no need to error out.
  if [ ! -d "/sys/class/net/${nic}/device" ]; then
    return;
  fi


  # We count the number of rx queues and assume number of rx queues == tx
  # queues. The number of queues shown in the sysfs stands for the initial
  # queues while the number of IRQs stands for the max queues. The number of
  # initial queues should be always less than or equal to that of the max
  # queues.
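  # For example (hypothetical numbers): a gvnic NIC exposing 32 entries in
  # msi_irqs yields num_irqs=16 tx/rx IRQ pairs, while queues/ may list only
  # 8 rx-* directories if fewer queues are currently configured.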
  core=$bind_cores_begin
  num_irqs=$(( $(ls /sys/class/net/"$nic"/device/msi_irqs | wc -l) / 2 ))
  num_q=$(ls -1 /sys/class/net/"$nic"/queues/ | grep rx | wc -l)

  echo "Setting irq binding for "$nic" to core [$core - $((core + num_q - 1))] ..."

  irqs=($(ls /sys/class/net/"$nic"/device/msi_irqs | sort -g))
  for ((irq = 0; irq < "$num_irqs"; irq++)); do
    tx_irq=${irqs[$irq]}
    rx_irq=${irqs[$((irq + num_irqs))]}

    # Only allocate $num_q cores to the IRQs and queues. If the number of IRQs
    # is more than that of queues, the CPUs will be wrapped around.
    core=$(( bind_cores_begin + irq % num_q ))

    # this is GVE's TX irq. See gve_tx_idx_to_ntfy().
    echo "$core" > /proc/irq/"$tx_irq"/smp_affinity_list

    # this is GVE's RX irq. See gve_rx_idx_to_ntfy().
    echo "$core" > /proc/irq/"$rx_irq"/smp_affinity_list

    # Check if the queue exists at present because the number of IRQs equals
    # the max number of queues allocated and could be greater than the current
    # number of queues.
    tx_queue=/sys/class/net/"$nic"/queues/tx-"$irq"
    if ls $tx_queue 1> /dev/null 2>&1; then
      # XPS (Transmit Packet Steering) allows a core to decide which queue to
      # select if its mask is found in one of the queue's xps_cpus
      cp /proc/irq/"$tx_irq"/smp_affinity $tx_queue/xps_cpus

      echo -en "$nic:q-$irq: \ttx: irq $tx_irq bind to $core \trx: irq $rx_irq bind to $core"
      echo -e " \txps_cpus bind to $(cat $tx_queue/xps_cpus)"
    else
      echo -e "$nic:q-$irq: \ttx: irq $tx_irq bind to $core \trx: irq $rx_irq bind to $core"
    fi

  done
}
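
# Example usage (hypothetical NIC name and core): for a gvnic interface
# "eth1" with 8 rx queues,
#   set_irq_range eth1 4
# pins each tx/rx IRQ pair to cores 4-11 (wrapping around if there are more
# IRQ pairs than queues) and mirrors each tx IRQ's smp_affinity into the
# matching tx queue's xps_cpus.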

# returns 0 (success) if all the interfaces contain physical-nic-id on the metadata server.
function contains_pnic_ids() {
  while IFS= read -r interface; do
    echo "Interface: $interface"

    network_interfaces_mds_attributes=$(curl -m 1 -H "Metadata-Flavor: Google" \
      "http://169.254.169.254/computeMetadata/v1/instance/network-interfaces/$interface/")

    if ! echo "$network_interfaces_mds_attributes" | grep -q "physical-nic-id"; then
      echo "physical-nic-id NOT found in interface $interface"
      return 1
    fi
  done < <(curl -m 1 -H "Metadata-Flavor: Google" http://169.254.169.254/computeMetadata/v1/instance/network-interfaces/)

  return 0
}
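
# Example usage: contains_pnic_ids && echo "all NICs report a physical NIC id"
# The metadata listing above returns one interface index per line (e.g. "0/",
# "1/"), and the per-interface attribute list is expected to include
# "physical-nic-id" only on multinic accelerator platforms.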

# returns 0 (success) if the platform is a multinic accelerator platform.
function is_multinic_accelerator_platform() {
  contains_pnic_ids
  CONTAINS_PNIC_IDS=$?

  machine_type=$(curl -m 1 -H "Metadata-Flavor: Google" \
    http://169.254.169.254/computeMetadata/v1/instance/machine-type)

  [[ $CONTAINS_PNIC_IDS -eq 0 \
  || "$machine_type" == *"a3-highgpu-8g"* \
  || "$machine_type" == *"a3-ultragpu-8g"* \
  || "$machine_type" == *"a3-megagpu-8g"* \
  || "$machine_type" == *"a3-edgegpu-8g"* \
  || "$machine_type" == *"a3-ultragpu-"* \
  || "$machine_type" == *"a4-highgpu-"* \
  || "$machine_type" == *"a4x-highgpu-"* ]] || return 1
  return 0
}


# returns 0 (success) if the supplied nic is a Gvnic device.
function is_gvnic() {
  local -r nic_name="$1"
  driver_type=$(ethtool -i "$nic_name" | grep driver)

  [[ "$driver_type" == *"gve"* 
  || "$driver_type" == *"gvnic"* ]] || return 1

  return 0
}
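
# Example usage (hypothetical interface name):
#   is_gvnic eth0 && echo "eth0 uses the gve driver"
# This relies on "ethtool -i" reporting a "driver: gve" line for gvnic devices.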

# Returns the vCPU ranges on each of the numa nodes. The vCPU ranges will
# be in the form of array of
# [numa0_irq_start_1, numa0_irq_end_1, numa0_irq_start_2, numa0_irq_end_2,
# numa1_irq_start_1, numa1_irq_end_1, numa1_irq_start_2, numa1_irq_end_2]
# This only returns the vCPU ranges on NUMA0 and NUMA1 since accelerator
# platforms of Gen3 and later only have 2 NUMA nodes.
# The expected vCPU ranges on each platform are:
# A3/A3-mega:
# numa0: [0, 51] [104, 155]
# numa1: [52, 103] [156, 207]
# A3-ultra:
# numa0: [0, 55] [113, 168]
# numa1: [56, 112] [169, 224]
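#
# As a worked example, on an A3 VM node0's cpulist reads "0-51,104-155";
# splitting on ',' and then '-' below yields the pairs (0, 51) and (104, 155),
# and the first start is bumped to 1 so vCPU 0 is left to the system.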
function get_vcpu_ranges_on_accelerator_platform {
  declare -n arr_ref=$1

  # Get vCPU ranges for NUMA 0
  numa0_irq_range=($(cat /sys/devices/system/node/node0/cpulist))
  numa0_irq_range0="${numa0_irq_range[0]%,*}"
  numa0_irq_range1="${numa0_irq_range[0]#*,}"

  numa0_irq_range0_start=$(echo "$numa0_irq_range0" | cut -d '-' -f 1)
  # Avoid setting binding IRQ on vCPU 0 as it is a busy vCPU being heavily
  # used by the system.
  numa0_irq_range0_start=$((numa0_irq_range0_start + 1))
  
  numa0_irq_range0_end=$(echo "$numa0_irq_range0" | cut -d '-' -f 2)
  numa0_irq_range1_start=$(echo "$numa0_irq_range1" | cut -d '-' -f 1)
  numa0_irq_range1_end=$(echo "$numa0_irq_range1" | cut -d '-' -f 2)

  # Get vCPU ranges for NUMA 1
  numa1_irq_range=($(cat /sys/devices/system/node/node1/cpulist))
  numa1_irq_range0="${numa1_irq_range[0]%,*}"
  numa1_irq_range1="${numa1_irq_range[0]#*,}"

  numa1_irq_range0_start=$(echo "$numa1_irq_range0" | cut -d '-' -f 1)
  numa1_irq_range0_end=$(echo "$numa1_irq_range0" | cut -d '-' -f 2)
  numa1_irq_range1_start=$(echo "$numa1_irq_range1" | cut -d '-' -f 1)
  numa1_irq_range1_end=$(echo "$numa1_irq_range1" | cut -d '-' -f 2)

  arr_ref=(
  "$numa0_irq_range0_start"
  "$numa0_irq_range0_end"
  "$numa0_irq_range1_start"
  "$numa0_irq_range1_end"
  "$numa1_irq_range0_start"
  "$numa1_irq_range0_end"
  "$numa1_irq_range1_start"
  "$numa1_irq_range1_end")
}

echo "Running $(basename $0)."
VIRTIO_NET_DEVS=/sys/bus/virtio/drivers/virtio_net/virtio*
is_multinic_accelerator_platform
IS_MULTINIC_ACCELERATOR_PLATFORM=$?

# Loop through all the virtionet devices and enable multi-queue
if [ -x "$(command -v ethtool)" ]; then
  for dev in $VIRTIO_NET_DEVS; do
    ETH_DEVS=${dev}/net/*
    for eth_dev in $ETH_DEVS; do
      eth_dev=$(basename "$eth_dev")
      if ! errormsg=$(ethtool -l "$eth_dev" 2>&1); then
        echo "ethtool says that $eth_dev does not support virtionet multiqueue: $errormsg."
        continue
      fi
      num_max_channels=$(ethtool -l "$eth_dev" | grep -m 1 Combined | cut -f2)
      if [[ -z "${num_max_channels}" || "${num_max_channels}" == "n/a" || "${num_max_channels}" == "1" ]]; then
        echo "num_max_channels is n/a or 1, skipping set channels for $eth_dev"
        continue
      fi
      if is_decimal_int "$num_max_channels" && \
        set_channels "$eth_dev" "$num_max_channels"; then
        echo "Set channels for $eth_dev to $num_max_channels."
      else
        echo "Could not set channels for $eth_dev to $num_max_channels."
      fi
    done
  done
else
  echo "ethtool not found: cannot configure virtionet multiqueue."
fi
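
# For reference, the manual equivalent of the loop above for one (hypothetical)
# device would be:
#   ethtool -L eth0 combined "$(ethtool -l eth0 | grep -m 1 Combined | cut -f2)"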

for dev in $VIRTIO_NET_DEVS
do
    dev=$(basename "$dev")
    irq_dir=/proc/irq/*
    for irq in $irq_dir
    do
      smp_affinity="${irq}/smp_affinity_list"
      [ ! -f "${smp_affinity}" ] && continue
      # Classify this IRQ as virtionet intx, virtionet MSI-X, or non-virtionet
      # If the IRQ type is virtionet intx, a subdirectory with the same name as
      # the device will be present. If the IRQ type is virtionet MSI-X, then
      # a subdirectory of the form <device name>-<input|output>.N will exist.
      # In this case, N is the input (output) queue number, and is specified as
      # a decimal integer ranging from 0 to K - 1 where K is the number of
      # input (output) queues in the virtionet device.
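      # For example, a hypothetical MSI-X IRQ serving the first input queue of
      # eth0 would contain a subdirectory named "eth0-input.0" (queue_num 0),
      # while a virtionet intx IRQ would contain just "eth0".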
      virtionet_intx_dir="${irq}/${dev}"
      virtionet_msix_dir_regex=".*/${dev}-(input|output)\.([0-9]+)$"
      if [ -d "${virtionet_intx_dir}" ]; then
        # All virtionet intx IRQs are delivered to CPU 0
        echo "Setting ${smp_affinity} to 01 for device ${dev}."
        echo "01" > "${smp_affinity}"
        continue
      fi
      # Not virtionet intx, probe for MSI-X
      virtionet_msix_found=0
      for entry in ${irq}/${dev}*; do
        if [[ "$entry" =~ ${virtionet_msix_dir_regex} ]]; then
          virtionet_msix_found=1
          queue_num=${BASH_REMATCH[2]}
        fi
      done
      affinity_hint="${irq}/affinity_hint"
      [ "$virtionet_msix_found" -eq 0 -o ! -f "${affinity_hint}" ] && continue

      # Set the IRQ CPU affinity to the virtionet-initialized affinity hint
      echo "Setting ${smp_affinity} to ${queue_num} for device ${dev}."
      echo "${queue_num}" > "${smp_affinity}"
      real_affinity=$(cat "${smp_affinity}")
      echo "${smp_affinity}: real affinity ${real_affinity}"
    done
done

# Set smp_affinity properly for gvnic queues. '-ntfy-block.' is unique to gve
# and will not affect virtio queues.
for i in /proc/irq/*; do
  if ls ${i}/*-ntfy-block.* 1> /dev/null 2>&1; then
    if [ -f ${i}/affinity_hint ]; then
      echo Setting smp_affinity on ${i} to $(cat ${i}/affinity_hint)
      cp ${i}/affinity_hint ${i}/smp_affinity
    fi
  fi
done

XPS=/sys/class/net/e*/queues/tx*/xps_cpus
num_cpus=$(nproc)
[[ $num_cpus -gt 63 ]] && num_cpus=63
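# The cap at 63 above likely keeps the 1 << cpu shifts below within bash's
# signed 64-bit arithmetic; CPUs beyond that are simply left out of the masks.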

num_queues=0
for q in $XPS; do
  interface=$(echo "$q" | grep -oP 'net/\K[^/]+')
  if [[ $IS_MULTINIC_ACCELERATOR_PLATFORM == 0 ]] && ! is_gvnic "$interface"; then
    continue
  fi
  num_queues=$((num_queues + 1))
done

# If we have more CPUs than queues, then stripe CPUs across tx affinity
# as CPUNumber % queue_count.
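# For example (hypothetical counts): with 4 queues and 12 usable CPUs, queue 1
# gets CPUs 1, 5, and 9 set in its xps_cpus mask.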
for q in $XPS; do
  interface=$(echo "$q" | grep -oP 'net/\K[^/]+')
  if [[ $IS_MULTINIC_ACCELERATOR_PLATFORM == 0 ]] && ! is_gvnic "$interface"; then
    continue
  fi

  queue_re=".*tx-([0-9]+).*$"
  if [[ "$q" =~ ${queue_re} ]]; then
    queue_num=${BASH_REMATCH[1]}
  fi

  xps=0
  for cpu in `seq $queue_num $num_queues $((num_cpus - 1))`; do
    xps=$((xps | (1 << cpu)))
  done

  # Linux xps_cpus requires a hex number with commas every 32 bits. It ignores
  # all bits above # cpus, so write a list of comma separated 32 bit hex values
  # with a comma between dwords.
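  # For example (hypothetical mask): a mask covering only CPUs 32-39 on a
  # 40-CPU VM is written as "000000ff,00000000" -- high-order dword first,
  # low-order dword last.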
  xps_dwords=()
  for i in $(seq 0 $(((num_cpus - 1) / 32)))
  do
    xps_dwords=(`printf "%08x" $((xps & 0xffffffff))` "${xps_dwords[@]}")
    xps=$((xps >> 32))
  done
  xps_string=$(IFS=, ; echo "${xps_dwords[*]}")


  echo ${xps_string} > $q
  printf "Queue %d XPS=%s for %s\n" $queue_num `cat $q` $q
done | sort -n -k2

if [[ ! $IS_MULTINIC_ACCELERATOR_PLATFORM == 0 ]]; then
  exit
fi


# Assign IRQ binding for network interfaces based on pci bus ordering.
# 
# The logic below shows how we rank interfaces by PCI bus order.
# > find /sys/class/net -type l | xargs -L 1 realpath | sort
# /sys/devices/pci0000:00/0000:00:0b.0/net/enp0s11
# /sys/devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:02.0/0000:06:00.0/net/enp6s0
# /sys/devices/pci0000:07/0000:07:00.0/0000:08:00.0/0000:09:02.0/0000:0c:00.0/net/enp12s0
# /sys/devices/pci0000:81/0000:81:00.0/0000:82:00.0/0000:83:02.0/0000:86:00.0/net/enp134s0
# /sys/devices/pci0000:87/0000:87:00.0/0000:88:00.0/0000:89:02.0/0000:8c:00.0/net/enp140s0
# /sys/devices/virtual/net/lo
#
# > find /sys/class/net -type l | xargs -L 1 realpath | sort | xargs -L 1 basename | grep -v lo
# enp0s11
# enp6s0
# enp12s0
# enp134s0
# enp140s0

irq_ranges=()
get_vcpu_ranges_on_accelerator_platform irq_ranges
echo "Binding vCPUs on NUMA0 [${irq_ranges[0]} ${irq_ranges[1]}], [${irq_ranges[2]} ${irq_ranges[3]}]}"
echo "Binding vCPUs on NUMA1 [${irq_ranges[4]} ${irq_ranges[5]}], [${irq_ranges[6]} ${irq_ranges[7]}]}"

numa0_irq_start=${irq_ranges[0]}
find /sys/class/net -type l | xargs -L 1 realpath | grep '/sys/devices/pci' | sort | xargs -L 1 basename | while read nic_name; do
  # For non-gvnic devices (e.g. mlx5), the IRQ bindings will be handled by the device's driver.
  if ! is_gvnic "$nic_name"; then
    echo "$nic_name is not gvnic device, skipping set irq on this device"
    continue
  fi

  echo "$nic_name is Gvnic device, continuing set IRQ on $nic_name ."

  nic_numa_node=$(cat /sys/class/net/"$nic_name"/device/numa_node)
  if [[ $nic_numa_node -ne 0 ]]; then
    continue
  fi

  nic_num_queues=$(ls -1 /sys/class/net/"$nic_name"/queues/ | grep rx | wc -l)
  bind_cores_begin=$numa0_irq_start
  bind_cores_end=$((bind_cores_begin + nic_num_queues))

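  # If this NIC's queue range would spill past the end of NUMA0's first vCPU
  # segment, restart it at the beginning of the second segment so one NIC's
  # IRQs are never split across the gap (the NUMA1 loop below applies the same
  # rule to NUMA1's segments).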
  if [[ $bind_cores_begin -lt ${irq_ranges[1]} ]] && [[ $bind_cores_end -gt ${irq_ranges[1]} ]]; then
    bind_cores_begin=${irq_ranges[2]}
    bind_cores_end=$((bind_cores_begin + nic_num_queues))
  fi

  set_irq_range "$nic_name" "$bind_cores_begin"

  numa0_irq_start=$bind_cores_end
done

numa1_irq_start=${irq_ranges[4]}
find /sys/class/net -type l | xargs -L 1 realpath | grep '/sys/devices/pci' | sort | xargs -L 1 basename | while read nic_name; do
  # For non-gvnic devices (e.g. mlx5), the IRQ bindings will be handled by the device's driver.
  if ! is_gvnic "$nic_name"; then
    echo "$nic_name is not gvnic device, skipping set irq on this device"
    continue
  fi

  echo "$nic_name is Gvnic device, continuing set IRQ on $nic_name ."

  nic_numa_node=$(cat /sys/class/net/"$nic_name"/device/numa_node)
  if [[ $nic_numa_node -ne 1 ]]; then
    continue
  fi

  nic_num_queues=$(ls -1 /sys/class/net/"$nic_name"/queues/ | grep rx | wc -l)
  bind_cores_begin=$numa1_irq_start
  bind_cores_end=$((bind_cores_begin + nic_num_queues))

  if [[ $bind_cores_begin -lt ${irq_ranges[5]} ]] && [[ $bind_cores_end -gt ${irq_ranges[5]} ]]; then
    bind_cores_begin=${irq_ranges[6]}
    bind_cores_end=$((bind_cores_begin + nic_num_queues))
  fi

  set_irq_range "$nic_name" "$bind_cores_begin"

  numa1_irq_start=$bind_cores_end
done