# File: //snap/google-cloud-cli/current/lib/surface/storage/diagnose.py
# -*- coding: utf-8 -*- #
# Copyright 2024 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Diagnose Google Cloud Storage common issues."""

import enum
import os

from googlecloudsdk.api_lib.storage import errors as api_errors
from googlecloudsdk.api_lib.storage.gcs_json import client as gcs_json_client
from googlecloudsdk.calliope import arg_parsers
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.storage import errors as command_errors
from googlecloudsdk.command_lib.storage import errors_util
from googlecloudsdk.command_lib.storage import storage_url
from googlecloudsdk.command_lib.storage.diagnose import direct_connectivity_diagnostic
from googlecloudsdk.command_lib.storage.diagnose import download_throughput_diagnostic as download_throughput_diagnostic_lib
from googlecloudsdk.command_lib.storage.diagnose import export_util
from googlecloudsdk.command_lib.storage.diagnose import latency_diagnostic as latency_diagnostic_lib
from googlecloudsdk.command_lib.storage.diagnose import system_info
from googlecloudsdk.command_lib.storage.diagnose import upload_throughput_diagnostic as upload_throughput_diagnostic_lib
from googlecloudsdk.command_lib.storage.resources import gcs_resource_reference
from googlecloudsdk.core import log

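# Upper bound enforced on --object-size and --object-sizes values for the
# diagnostic test objects.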
_OBJECT_SIZE_UPPER_BOUND = '1GB'


def get_bucket_resource(
    bucket_url: storage_url.StorageUrl,
) -> gcs_resource_reference.GcsBucketResource:
  """Fetches the bucket resource for the given bucket storage URL.

  Args:
    bucket_url: The URL object to get the bucket resource for.

  Returns:
    The bucket resource for the given URL.

  Raises:
    FatalError: If the bucket resource could not be fetched.
  """
  gcs_client = gcs_json_client.JsonClient()
  try:
    return gcs_client.get_bucket(bucket_url.bucket_name)
  except api_errors.CloudApiError as e:
    raise command_errors.FatalError(
        f'Bucket metadata could not be fetched for {bucket_url.bucket_name}'
    ) from e


def _validate_args(args):
  """Validates and raises error if the command arguments are invalid."""
  errors_util.raise_error_if_not_gcs_bucket(
      args.command_path, storage_url.storage_url_from_string(args.url)
  )

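  # When exporting a diagnostic bundle, the destination (if provided) must be
  # an existing directory.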
  if (
      args.export
      and args.destination
      and not (
          os.path.exists(args.destination) and os.path.isdir(args.destination)
      )
  ):
    raise ValueError(
        f'Invalid destination path: {args.destination}. Please provide'
        ' a valid path.'
    )


class TestType(enum.Enum):
  """Enum class for specifying performance test type for diagnostic tests."""

  DIRECT_CONNECTIVITY = 'DIRECT_CONNECTIVITY'
  DOWNLOAD_THROUGHPUT = 'DOWNLOAD_THROUGHPUT'
  UPLOAD_THROUGHPUT = 'UPLOAD_THROUGHPUT'
  LATENCY = 'LATENCY'


@base.DefaultUniverseOnly
class Diagnose(base.Command):
  """Diagnose Google Cloud Storage."""

  detailed_help = {
      'DESCRIPTION': """
      The diagnose command runs a series of diagnostic tests for common gcloud
      storage issues.

      The `URL` argument must name an existing bucket for which the user
      already has write permissions. Standard billing also applies.
      Several test files/objects will be uploaded to and downloaded from this
      bucket to gauge performance metrics. All temporary files are deleted on
      successful completion of the command.

      By default, the command executes the `DOWNLOAD_THROUGHPUT`,
      `UPLOAD_THROUGHPUT` and `LATENCY` tests. The tests to execute can be
      overridden with the `--test-type` flag.
      Each test uses the command defaults or gcloud CLI configuration values
      when performing its operations. These values can be overridden with
      flags such as `--process-count`, `--thread-count`, and `--download-type`.

      On successful completion, the command outputs a diagnostic report with
      system information such as free memory, available CPU, average CPU load
      per test, and disk counter deltas, along with diagnostic information
      specific to individual tests.

      """,
      'EXAMPLES': """

      The following command runs the default diagnostic tests on the
      ``my-bucket'' bucket:

      $ {command} gs://my-bucket

      The following command runs only UPLOAD_THROUGHPUT and DOWNLOAD_THROUGHPUT
      diagnostic tests:

      $ {command} gs://my-bucket --test-type=UPLOAD_THROUGHPUT,DOWNLOAD_THROUGHPUT

      The following command runs the diagnostic tests using ``10'' objects of
      ``1MiB'' size each, with at most ``10'' processes and ``10'' threads:

      $ {command} gs://my-bucket --object-count=10 --object-size=1MiB
      --process-count=10 --thread-count=10

      The following command bundles and exports the diagnostic information to
      a user-defined ``PATH'' destination:

      $ {command} gs://my-bucket --export --destination=<PATH>
      """,
  }

  @classmethod
  def Args(cls, parser):
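    # Keep flags in their declaration order (instead of alphabetical order) in
    # the generated help text.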
    parser.SetSortArgs(False)

    parser.add_argument(
        'url',
        type=str,
        help='Bucket URL to use for the diagnostic tests.',
    )
    parser.add_argument(
        '--test-type',
        type=arg_parsers.ArgList(
            choices=sorted([option.value for option in TestType])
        ),
        metavar='TEST_TYPES',
        help="""
        Tests to run as part of this diagnosis. The following tests are
        supported:

        DIRECT_CONNECTIVITY: Run a test upload over the Direct Connectivity
        network path and run other diagnostics if the upload fails.

        DOWNLOAD_THROUGHPUT: Download objects from the specified bucket and
        record the number of bytes transferred per second.

        UPLOAD_THROUGHPUT: Upload objects to the specified bucket and record
        the number of bytes transferred per second.

        LATENCY: Write the objects, retrieve their metadata, read the objects,
        and record latency of each operation.
        """,
        default=[],
    )
    parser.add_argument(
        '--download-type',
        choices=sorted([
            option.value
            for option in download_throughput_diagnostic_lib.DownloadType
        ]),
        default=download_throughput_diagnostic_lib.DownloadType.FILE,
        help="""
        Download strategy to use for the DOWNLOAD_THROUGHPUT diagnostic test.

        STREAMING: Downloads objects into memory; does not use parallelism.
        `--process-count` and `--thread-count` flag values will be ignored if
        provided.

        SLICED: Performs a [sliced download](https://cloud.google.com/storage/docs/sliced-object-downloads)
        of objects to a directory.
        Parallelism can be controlled via `--process-count` and `--thread-count`
        flags.

        FILE: Download objects as files. Parallelism can be controlled via
        `--process-count` and `--thread-count` flags.
        """,
    )
    parser.add_argument(
        '--logs-path',
        help=(
            'If the diagnostic supports writing logs, write the logs to this'
            ' file location.'
        ),
    )
    parser.add_argument(
        '--upload-type',
        choices=sorted([
            option.value
            for option in upload_throughput_diagnostic_lib.UploadType
        ]),
        default=upload_throughput_diagnostic_lib.UploadType.FILE,
        help="""
        Upload strategy to use for the UPLOAD_THROUGHPUT diagnostic test.

        FILE: Uploads files to a bucket. Parallelism can be controlled via
        `--process-count` and `--thread-count` flags.

        PARALLEL_COMPOSITE: Uploads files using a [parallel
        composite strategy](https://cloud.google.com/storage/docs/parallel-composite-uploads).
        Parallelism can be controlled via `--process-count` and `--thread-count`
        flags.

        STREAMING: Streams data to the bucket; does not use parallelism.
        `--process-count` and `--thread-count` flag values will be ignored if
        provided.
        """,
    )

    parser.add_argument(
        '--process-count',
        type=arg_parsers.BoundedInt(lower_bound=1),
        help='Maximum number of processes to use for each diagnostic test.',
    )
    parser.add_argument(
        '--thread-count',
        type=arg_parsers.BoundedInt(lower_bound=1),
        help='Maximum number of threads to use for each diagnostic test.',
    )

    object_properties_group = parser.add_group(
        sort_args=False, help='Object properties:'
    )

    object_properties_group.add_argument(
        '--object-count',
        required=True,
        type=arg_parsers.BoundedInt(lower_bound=1),
        help='Number of objects to use for each diagnostic test.',
    )

    object_size_properties_group = object_properties_group.add_group(
        mutex=True,
        sort_args=False,
        help='Object size properties:',
        required=True,
    )
    object_size_properties_group.add_argument(
        '--object-size',
        type=arg_parsers.BinarySize(upper_bound=_OBJECT_SIZE_UPPER_BOUND),
        help='Object size to use for the diagnostic tests.',
    )
    object_size_properties_group.add_argument(
        '--object-sizes',
        metavar='OBJECT_SIZES',
        type=arg_parsers.ArgList(
            element_type=arg_parsers.BinarySize(
                upper_bound=_OBJECT_SIZE_UPPER_BOUND
            )
        ),
        help="""
        List of object sizes to use for the tests. One size must be provided
        for each of the objects specified with the `--object-count` flag.
        """,
    )

    export_group = parser.add_group(
        sort_args=False, help='Export diagnostic bundle.'
    )
    export_group.add_argument(
        '--export',
        action='store_true',
        required=True,
        help="""
        Generate and export a diagnostic bundle. The following
        information will be bundled and exported into a gzipped tarball
        (.tar.gz):

        - Latest gcloud CLI logs.
        - Output of running the `gcloud storage diagnose` command.
        - Output of running the `gcloud info --anonymize` command.

        Note: This command generates a bundle containing system information like
        disk counter deltas, CPU information, and system configurations. Please
        exercise caution while sharing.
        """,
    )
    export_group.add_argument(
        '--destination',
        type=str,
        help=(
            'Destination file path where the diagnostic bundle will be'
            ' exported.'
        ),
    )
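    # Render results as a table with one row per diagnostic test and a nested
    # table of per-operation results.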
    parser.display_info.AddFormat("""
                                  table(
                                    name,
                                    operation_results[]:format='table[box](name,payload_description:wrap,result:wrap)'
                                  )
                                  """)

  def _run_tests_with_performance_tracking(
      self, args, url_object, tests_to_run
  ):
    """Runs test with system performance tracking."""
    object_sizes = None

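    # Expand a single --object-size into one entry per object, or validate
    # that an explicit --object-sizes list matches --object-count.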
    if args.object_count:
      if args.object_sizes:
        if len(args.object_sizes) != args.object_count:
          raise ValueError(
              'Number of object sizes provided should match the number of'
              ' objects.'
          )
        object_sizes = args.object_sizes
      elif args.object_size:
        object_sizes = [args.object_size] * args.object_count

    system_info_provider = system_info.get_system_info_provider()
    test_results = []
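    # Sample disk I/O counters around the diagnostic runs so that disk counter
    # deltas can be reported alongside the test results.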
    with system_info.get_disk_io_stats_delta_diagnostic_result(
        system_info_provider, test_results
    ):
      if TestType.LATENCY.value in tests_to_run:
        latency_diagnostic = latency_diagnostic_lib.LatencyDiagnostic(
            url_object,
            object_sizes,
        )
        latency_diagnostic.execute()
        test_results.append(latency_diagnostic.result)

      if TestType.DOWNLOAD_THROUGHPUT.value in tests_to_run:
        download_type = download_throughput_diagnostic_lib.DownloadType(
            args.download_type
        )
        download_throughput_diagnostic = (
            download_throughput_diagnostic_lib.DownloadThroughputDiagnostic(
                url_object,
                download_type,
                object_sizes,
                process_count=args.process_count,
                thread_count=args.thread_count,
            )
        )
        download_throughput_diagnostic.execute()
        test_results.append(download_throughput_diagnostic.result)

      if TestType.UPLOAD_THROUGHPUT.value in tests_to_run:
        upload_type = upload_throughput_diagnostic_lib.UploadType(
            args.upload_type
        )
        upload_throughput_diagnostic = (
            upload_throughput_diagnostic_lib.UploadThroughputDiagnostic(
                url_object,
                upload_type,
                object_sizes,
                process_count=args.process_count,
                thread_count=args.thread_count,
            )
        )
        upload_throughput_diagnostic.execute()
        test_results.append(upload_throughput_diagnostic.result)

      # Capture the system information last so that the CPU load average can
      # account for the diagnostic test runs.
      test_results.append(
          system_info.get_system_info_diagnostic_result(system_info_provider)
      )
      return test_results

  def Run(self, args):

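    # Tests that run when --test-type is not specified.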
    default_tests = [
        TestType.DOWNLOAD_THROUGHPUT.value,
        TestType.LATENCY.value,
        TestType.UPLOAD_THROUGHPUT.value,
    ]

    _validate_args(args)
    url_object = storage_url.storage_url_from_string(args.url)
    bucket_resource = get_bucket_resource(url_object)

    log.status.Print(
        f'Using the {bucket_resource.name} bucket for the diagnostic tests.'
    )
    log.status.Print(f'Bucket location: {bucket_resource.location}')
    log.status.Print(
        f'Bucket storage class: {bucket_resource.default_storage_class}'
    )

    if args.test_type:
      tests_to_run = args.test_type
    else:
      tests_to_run = default_tests

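    # Skip the performance-tracking harness when only the Direct Connectivity
    # test was requested; it runs separately below.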
    if tests_to_run == [TestType.DIRECT_CONNECTIVITY.value]:
      test_results = []
    else:
      test_results = self._run_tests_with_performance_tracking(
          args, url_object, tests_to_run
      )

    if TestType.DIRECT_CONNECTIVITY.value in tests_to_run:
      direct_connectivity = (
          direct_connectivity_diagnostic.DirectConnectivityDiagnostic(
              bucket_resource,
              logs_path=args.logs_path,
          )
      )
      direct_connectivity.execute()
      test_results.append(direct_connectivity.result)

    if args.export:
      log.status.Print('Exporting diagnostic bundle...')
      export_path = export_util.export_diagnostic_bundle(
          test_results, args.destination
      )
      log.status.Print(
          'Successfully exported diagnostic bundle to {}'.format(export_path)
      )
      return None

    log.status.Print('Generating diagnostic report...')
    return test_results