HEX
Server: Apache/2.4.65 (Ubuntu)
System: Linux ielts-store-v2 6.8.0-1036-gcp #38~22.04.1-Ubuntu SMP Thu Aug 14 01:19:18 UTC 2025 x86_64
User: root (0)
PHP: 7.2.34-54+ubuntu20.04.1+deb.sury.org+1
Disabled: pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wifcontinued,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_get_handler,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority,pcntl_async_signals,
Upload Files
File: //snap/google-cloud-cli/394/lib/surface/compute/instances/report_host_as_faulty.py
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Command for reporting a virtual machine instance as faulty."""

from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

from googlecloudsdk.api_lib.compute import base_classes
from googlecloudsdk.api_lib.compute.operations import poller
from googlecloudsdk.api_lib.util import waiter
from googlecloudsdk.calliope import arg_parsers
from googlecloudsdk.calliope import base
from googlecloudsdk.calliope import exceptions
from googlecloudsdk.command_lib.compute.instances import flags
from googlecloudsdk.command_lib.util.apis import arg_utils
from googlecloudsdk.core import exceptions as core_exceptions
from googlecloudsdk.core import log


FAULT_REASONS_CHOICES = [
    'BEHAVIOR_UNSPECIFIED',
    'PERFORMANCE',
    'SILENT_DATA_CORRUPTION',
    'UNRECOVERABLE_GPU_ERROR',
]

DISRUPTION_SCHEDULE_DEFAULT_CHOICE = 'IMMEDIATE'
DISRUPTION_SCHEDULE_CHOICES = ['IMMEDIATE', 'FUTURE']

DETAILED_HELP = {
    'brief': 'Report a host as faulty to start the repair process.',
    'DESCRIPTION': """\
          *{command}* is used to report a host as faulty to start the repair
          process.
        """,
    'EXAMPLES': """\
        To report a host as faulty for an instance named ``test-instance'', run:

          $ {command} test-instance --fault-reasons=behavior=SILENT_DATA_CORRUPTION,description="affecting our ML job" \
          --disruption-schedule=IMMEDIATE \
        """,
}


def _ValidateFaultReasonsBehavior(behavior_input):
  """Validates behavior field, throws exception if invalid."""
  behavior = behavior_input.upper()
  if behavior not in FAULT_REASONS_CHOICES:
    raise exceptions.InvalidArgumentException(
        'behavior', 'Invalid value {} for behavior'.format(behavior_input)
    )
  return behavior


@base.ReleaseTracks(
    base.ReleaseTrack.ALPHA, base.ReleaseTrack.BETA, base.ReleaseTrack.GA
)
@base.UniverseCompatible
class ReportHostAsFaulty(base.SilentCommand):
  """Report a host as faulty to start the repair process."""

  @staticmethod
  def Args(parser):
    flags.INSTANCE_ARG.AddArgument(parser)
    ReportHostAsFaulty._AddDisruptionSchedule(parser)
    ReportHostAsFaulty._AddFaultReasons(parser)
    base.ASYNC_FLAG.AddToParser(parser)

  @staticmethod
  def _AddDisruptionSchedule(parser):
    parser.add_argument(
        '--disruption-schedule',
        required=True,
        choices=DISRUPTION_SCHEDULE_CHOICES,
        type=arg_utils.ChoiceToEnumName,
        help="""\
        Specifies the timing for initiating the fault reporting process.
        The default value is {default_choice} which initiates the process right away.
        """.format(default_choice=DISRUPTION_SCHEDULE_DEFAULT_CHOICE),
    )

  @staticmethod
  def _AddFaultReasons(parser):
    parser.add_argument(
        '--fault-reasons',
        type=arg_parsers.ArgDict(
            min_length=1,
            spec={
                'description': str,
                'behavior': _ValidateFaultReasonsBehavior,
            },
            required_keys=['behavior'],
        ),
        required=True,
        action='append',
        help="""\
        Specified and can include one or more of the following types:
        {choices}.
        This helps categorize the nature of the fault being reported.
        """.format(choices=FAULT_REASONS_CHOICES),
    )

  def _BuildRequest(self, args, instance_ref, client):
    fault_reasons = [
        client.messages.InstancesReportHostAsFaultyRequestFaultReason(
            behavior=arg_utils.ChoiceToEnum(
                reason.get('behavior'),
                client.messages.InstancesReportHostAsFaultyRequestFaultReason.BehaviorValueValuesEnum,
            ),
            description=reason.get('description'),
        )
        for reason in args.fault_reasons
    ]

    disruption_schedule_enum = arg_utils.ChoiceToEnum(
        args.disruption_schedule,
        client.messages.InstancesReportHostAsFaultyRequest.DisruptionScheduleValueValuesEnum,
    )

    request = client.messages.ComputeInstancesReportHostAsFaultyRequest(
        instance=instance_ref.Name(),
        project=instance_ref.project,
        zone=instance_ref.zone,
        instancesReportHostAsFaultyRequest=client.messages.InstancesReportHostAsFaultyRequest(
            disruptionSchedule=disruption_schedule_enum,
            faultReasons=fault_reasons,
        ),
    )
    return request

  def _Run(self, args, instance_refs, client, holder, is_async=False):
    if not isinstance(instance_refs, list):
      instance_refs = [instance_refs]

    requests = [
        (
            client.apitools_client.instances,
            'ReportHostAsFaulty',
            self._BuildRequest(args, instance_ref, client),
        )
        for instance_ref in instance_refs
    ]

    errors_to_collect = []
    responses = client.AsyncRequests(
        requests,
        errors_to_collect,
    )
    if errors_to_collect:
      raise core_exceptions.MultiError(errors_to_collect)

    operation_refs = [holder.resources.Parse(r.selfLink) for r in responses]

    if is_async:
      for operation_ref in operation_refs:
        log.status.Print(
            'The report host as faulty operation is currently in progress.'
            ' Operation URI: [{}].'.format(
                operation_ref.SelfLink()
            )
        )
      log.status.Print(
          'Use [gcloud compute operations describe URI] command to check the '
          'status of the operation(s).'
      )
      return responses
    else:
      operation_poller = poller.BatchPoller(
          client, client.apitools_client.instances, instance_refs
      )

      result = waiter.WaitFor(
          operation_poller,
          poller.OperationBatch(operation_refs),
          'Reporting host as faulty for instance(s) {0} in progress.'.format(
              ', '.join(i.Name() for i in instance_refs)
          ),
          max_wait_ms=None,
      )

      for instance_ref in instance_refs:
        log.status.Print(
            'Successfully reported host as faulty for instance: [{0}].'
            ' The operation has been completed.'.format(instance_ref.Name())
        )

      return result

  def Run(self, args):
    holder = base_classes.ComputeApiHolder(self.ReleaseTrack())
    client = holder.client

    instance_refs = flags.INSTANCE_ARG.ResolveAsResource(
        args,
        holder.resources,
        scope_lister=flags.GetInstanceZoneScopeLister(client),
    )

    return self._Run(args, instance_refs, client, holder, is_async=args.async_)


ReportHostAsFaulty.detailed_help = DETAILED_HELP