HEX

File: //snap/google-cloud-cli/394/lib/surface/ai/model_garden/models/deploy.py
# -*- coding: utf-8 -*- #
# Copyright 2024 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model Garden deploy command."""

from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

import time

from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai import operations
from googlecloudsdk.api_lib.ai.model_garden import client as client_mg
from googlecloudsdk.api_lib.util import apis
from googlecloudsdk.calliope import arg_parsers
from googlecloudsdk.calliope import base
from googlecloudsdk.calliope import exceptions as c_exceptions
from googlecloudsdk.command_lib.ai import constants
from googlecloudsdk.command_lib.ai import endpoint_util
from googlecloudsdk.command_lib.ai import flags
from googlecloudsdk.command_lib.ai import model_garden_utils
from googlecloudsdk.command_lib.ai import region_util
from googlecloudsdk.command_lib.ai import validation
from googlecloudsdk.core import properties


@base.ReleaseTracks(
    base.ReleaseTrack.ALPHA, base.ReleaseTrack.BETA, base.ReleaseTrack.GA
)
@base.UniverseCompatible
class Deploy(base.Command):
  """Deploy a model in Model Garden to a Vertex AI endpoint.

  ## EXAMPLES

  To deploy a Model Garden model `google/gemma2@gemma-2-9b` under project
  `example` in region `us-central1`, run:

    $ gcloud ai model-garden models deploy
    --model=google/gemma2@gemma-2-9b
    --project=example
    --region=us-central1

  To deploy a Hugging Face model `meta-llama/Meta-Llama-3-8B` under project
  `example` in region `us-central1`, run:

    $ gcloud ai model-garden models deploy
    --model=meta-llama/Meta-Llama-3-8B
    --hugging-face-access-token={hf_token}
    --project=example
    --region=us-central1
  """

  # Implementation overview (kept out of the docstring above, which is
  # user-facing help text):
  #   * `--model` values starting with `gs://` are treated as custom weights.
  #   * Otherwise, values containing `@` are treated as Model Garden models and
  #     values without `@` as Hugging Face models.

  @staticmethod
  def Args(parser):
    """Registers the flags accepted by the deploy command.

    Args:
      parser: The calliope argument parser to add the flags to.
    """
    # --- Model selection and endpoint naming. ---
    # NOTE(review): the `gs://` example bucket in the help text below reads
    # `gs://-model-garden-public-us/...`; it looks like a prefix may be missing
    # -- confirm the intended bucket name before changing it.
    base.Argument(
        '--model',
        required=True,
        help=(
            'The model to be deployed. If it is a Model Garden model, it should'
            ' be in the format of'
            ' `{publisher_name}/{model_name}@{model_version_name}`, e.g.'
            ' `google/gemma2@gemma-2-2b`. If it is a Hugging Face model, it'
            ' should be in the convention of Hugging Face models, e.g.'
            ' `meta-llama/Meta-Llama-3-8B`. If it is a Custom Weights model, it'
            ' should be in the format of `gs://{gcs_bucket_uri}`, e.g. `gs://'
            '-model-garden-public-us/llama3.1/Meta-Llama-3.1-8B-Instruct`.'
        ),
    ).AddToParser(parser)
    base.Argument(
        '--hugging-face-access-token',
        required=False,
        help=(
            'The access token from Hugging Face needed to read the'
            ' model artifacts of gated models. It is only needed when'
            ' the Hugging Face model to deploy is gated.'
        ),
    ).AddToParser(parser)
    base.Argument(
        '--endpoint-display-name',
        required=False,
        help='Display name of the endpoint with the deployed model.',
    ).AddToParser(parser)

    flags.AddRegionResourceArg(
        parser, 'to deploy the model', prompt_func=region_util.PromptForOpRegion
    )

    # --- Machine / accelerator selection. ---
    base.Argument(
        '--machine-type',
        help=(
            'The machine type to deploy the model to. It should be a supported'
            ' machine type from the deployment configurations of the model. Use'
            ' `gcloud ai model-garden models list-deployment-config` to check'
            ' the supported machine types.'
        ),
        required=False,
    ).AddToParser(parser)
    base.Argument(
        '--accelerator-type',
        help=(
            'The accelerator type to serve the model. It should be a supported'
            ' accelerator type from the verified deployment configurations of'
            ' the model. Use `gcloud ai model-garden models'
            ' list-deployment-config` to check the supported accelerator types.'
        ),
        required=False,
    ).AddToParser(parser)
    base.Argument(
        '--accelerator-count',
        help=(
            'The accelerator count to serve the model. Accelerator count'
            ' should be non-negative.'
        ),
        type=int,
        required=False,
    ).AddToParser(parser)

    # --- Deployment behavior switches. ---
    base.Argument(
        '--accept-eula',
        help=(
            'When set, the user accepts the End User License Agreement (EULA)'
            ' of the model.'
        ),
        action='store_true',
        default=False,
        required=False,
    ).AddToParser(parser)
    base.Argument(
        '--asynchronous',
        help=(
            'If set to true, the command will terminate immediately and not'
            ' keep polling the operation status.'
        ),
        action='store_true',
        default=False,
        required=False,
    ).AddToParser(parser)
    base.Argument(
        '--reservation-affinity',
        type=arg_parsers.ArgDict(
            spec={
                'reservation-affinity-type': str,
                'key': str,
                'values': arg_parsers.ArgList(),
            },
            required_keys=['reservation-affinity-type'],
        ),
        help=(
            'A ReservationAffinity can be used to configure a Vertex AI'
            ' resource (e.g., a DeployedModel) to draw its Compute Engine'
            ' resources from a Shared Reservation, or exclusively from'
            ' on-demand capacity.'
        ),
    ).AddToParser(parser)

    base.Argument(
        '--spot',
        action='store_true',
        default=False,
        required=False,
        help='If true, schedule the deployment workload on Spot VM.',
    ).AddToParser(parser)

    base.Argument(
        '--use-dedicated-endpoint',
        action='store_true',
        default=False,
        required=False,
        help=(
            'If true, the endpoint will be exposed through a dedicated DNS.'
            ' Your request to the dedicated DNS will be isolated from other'
            " users' traffic and will have better performance and reliability."
        ),
    ).AddToParser(parser)

    base.Argument(
        '--enable-fast-tryout',
        action='store_true',
        default=False,
        required=False,
        help=(
            'If True, model will be deployed using faster deployment path.'
            ' Useful for quick experiments. Not for production workloads. Only'
            ' available for most popular models with certain machine types.'
        ),
    ).AddToParser(parser)

    # --- Serving container overrides. ---
    base.Argument(
        '--container-image-uri',
        help=("""\
      URI of the Model serving container file in the Container Registry
      (e.g. gcr.io/myproject/server:latest).
      """),
    ).AddToParser(parser)

    parser.add_argument(
        '--container-env-vars',
        metavar='KEY=VALUE',
        type=arg_parsers.ArgDict(),
        action=arg_parsers.UpdateAction,
        help='List of key-value pairs to set as environment variables.',
    )
    parser.add_argument(
        '--container-command',
        type=arg_parsers.ArgList(),
        metavar='COMMAND',
        action=arg_parsers.UpdateAction,
        help="""\
  Entrypoint for the container image. If not specified, the container
  image's default entrypoint is run.
  """,
    )
    parser.add_argument(
        '--container-args',
        metavar='ARG',
        type=arg_parsers.ArgList(),
        help="""\
  Comma-separated arguments passed to the command run by the container
  image. If not specified and no `--container-command` is provided, the
  container image's default command is used.
  """,
    )
    parser.add_argument(
        '--container-ports',
        metavar='PORT',
        type=arg_parsers.ArgList(element_type=arg_parsers.BoundedInt(1, 65535)),
        action=arg_parsers.UpdateAction,
        help="""\
  Container ports to receive http requests at. Must be a number between 1 and
  65535, inclusive.
  """,
    )
    parser.add_argument(
        '--container-grpc-ports',
        metavar='PORT',
        type=arg_parsers.ArgList(element_type=arg_parsers.BoundedInt(1, 65535)),
        action=arg_parsers.UpdateAction,
        help="""\
  Container ports to receive grpc requests at. Must be a number between 1 and
  65535, inclusive.
  """,
    )
    parser.add_argument(
        '--container-predict-route',
        help='HTTP path to send prediction requests to inside the container.',
    )
    parser.add_argument(
        '--container-health-route',
        help='HTTP path to send health checks to inside the container.',
    )
    parser.add_argument(
        '--container-deployment-timeout-seconds',
        type=int,
        help='Deployment timeout in seconds.',
    )
    parser.add_argument(
        '--container-shared-memory-size-mb',
        type=int,
        help="""\
  The amount of the VM memory to reserve as the shared memory for the model in
  megabytes.
    """,
    )

    # --- Container startup / health probes. ---
    parser.add_argument(
        '--container-startup-probe-exec',
        type=arg_parsers.ArgList(),
        metavar='STARTUP_PROBE_EXEC',
        help="""\
  Exec specifies the action to take. Used by startup probe. An example of this
  argument would be ["cat", "/tmp/healthy"].
    """,
    )
    parser.add_argument(
        '--container-startup-probe-period-seconds',
        type=int,
        help="""\
  How often (in seconds) to perform the startup probe. Default to 10 seconds.
  Minimum value is 1.
    """,
    )
    parser.add_argument(
        '--container-startup-probe-timeout-seconds',
        type=int,
        help="""\
  Number of seconds after which the startup probe times out. Defaults to 1 second.
  Minimum value is 1.
    """,
    )
    parser.add_argument(
        '--container-health-probe-exec',
        type=arg_parsers.ArgList(),
        metavar='HEALTH_PROBE_EXEC',
        help="""\
  Exec specifies the action to take. Used by health probe. An example of this
  argument would be ["cat", "/tmp/healthy"].
    """,
    )
    parser.add_argument(
        '--container-health-probe-period-seconds',
        type=int,
        help="""\
  How often (in seconds) to perform the health probe. Default to 10 seconds.
  Minimum value is 1.
    """,
    )
    parser.add_argument(
        '--container-health-probe-timeout-seconds',
        type=int,
        help="""\
  Number of seconds after which the health probe times out. Defaults to 1 second.
  Minimum value is 1.
    """,
    )

  def Run(self, args):
    """Validates the arguments and deploys the requested model.

    A `--model` value starting with `gs://` is deployed as a custom weights
    model; any other value is validated and deployed as a Model Garden or
    Hugging Face model.

    Args:
      args: The parsed command-line arguments. `args.region` is overwritten
        with the location ID of the parsed region resource.

    Raises:
      c_exceptions.InvalidArgumentException: For a custom weights model, if
        only some of --machine-type, --accelerator-type and
        --accelerator-count are provided.
      c_exceptions.UnknownArgumentException: If the publisher model lookup
        returns HTTP 404 for the requested model.
    """
    is_custom_weights_model = args.model.startswith('gs://')
    if not is_custom_weights_model:
      validation.ValidateModelGardenModelArgs(args)
    validation.ValidateDisplayName(args.endpoint_display_name)

    region_ref = args.CONCEPTS.region.Parse()
    args.region = region_ref.AsDict()['locationsId']
    version = constants.BETA_VERSION

    # Both paths start under a us-central1 endpoint override; the actual
    # deployment later re-enters an override for the user-selected region.
    with endpoint_util.AiplatformEndpointOverrides(
        version, region='us-central1'
    ):
      if is_custom_weights_model:
        self._DeployCustomWeightsModel(args, version)
      else:
        self._DeployModelGardenModel(args, version)

  def _DeployCustomWeightsModel(self, args, version):
    """Deploys a custom weights (`gs://...`) model.

    Args:
      args: The parsed command-line arguments.
      version: The API version string used for clients and endpoint overrides.

    Raises:
      c_exceptions.InvalidArgumentException: If the machine type, accelerator
        type and accelerator count flags are not all set or all unset.
    """
    # The three flags are compared by truthiness, so an accelerator count of 0
    # counts as "not provided".
    if not (
        bool(args.machine_type)
        == bool(args.accelerator_type)
        == bool(args.accelerator_count)
    ):
      raise c_exceptions.InvalidArgumentException(
          '--machine-type, --accelerator-type and --accelerator-count',
          ' Arguments for MachineType, AcceleratorType and AcceleratorCount'
          ' must either all be provided or all be empty for custom weights'
          ' model deployment.',
      )

    # When no machine configuration is given, Deploy receives None as the
    # machine spec.
    machine_spec = None
    if args.machine_type:
      # Check accelerator quota.
      model_garden_utils.CheckAcceleratorQuota(
          args,
          machine_type=args.machine_type,
          accelerator_type=args.accelerator_type,
          accelerator_count=args.accelerator_count,
      )
      client = apis.GetClientInstance(
          constants.AI_PLATFORM_API_NAME,
          constants.AI_PLATFORM_API_VERSION[version],
      )
      machine_spec_type = (
          client.MESSAGES_MODULE.GoogleCloudAiplatformV1beta1MachineSpec
      )
      machine_spec = machine_spec_type(
          machineType=args.machine_type,
          acceleratorType=machine_spec_type.AcceleratorTypeValueValuesEnum(
              args.accelerator_type
          ),
          acceleratorCount=args.accelerator_count,
      )

    # The default endpoint name embeds the current Unix time in whole seconds.
    default_endpoint_name = '-'.join([
        'custom-weights',
        str(int(time.time())),
        'mg-cli-deploy',
    ])

    # NOTE(review): unlike the Model Garden path, the aiplatform endpoint
    # override property is not cleared before the regional override is entered
    # here -- confirm this is intentional.
    self._DeployInRegion(
        args, version, machine_spec, default_endpoint_name, args.model
    )

  def _DeployModelGardenModel(self, args, version):
    """Deploys a Model Garden or Hugging Face model.

    Args:
      args: The parsed command-line arguments.
      version: The API version string used for clients and endpoint overrides.

    Raises:
      c_exceptions.UnknownArgumentException: If the publisher model lookup
        returns HTTP 404.
    """
    # Model names without an `@version` suffix are treated as Hugging Face
    # models.
    is_hf_model = '@' not in args.model

    # Step 1: Fetch PublisherModel data, including deployment configs. Use
    # us-central1 because all data are stored in us-central1.
    mg_client = client_mg.ModelGardenClient()

    # Convert to lower case because API only takes in lower case.
    # NOTE(review): the unpacking assumes exactly one '/' in --model;
    # presumably ValidateModelGardenModelArgs enforces this -- confirm.
    publisher_name, model_name = args.model.lower().split('/')
    publisher_model_name = f'publishers/{publisher_name}/models/{model_name}'

    if is_hf_model:
      try:
        publisher_model = mg_client.GetPublisherModel(
            model_name=publisher_model_name,
            is_hugging_face_model=True,
        )
      except apitools_exceptions.HttpNotFoundError as err:
        # Chain the HTTP error so the root cause stays visible in tracebacks.
        raise c_exceptions.UnknownArgumentException(
            '--model',
            f'{args.model} is not a supported Hugging Face'
            ' model for deployment in Model Garden.',
        ) from err

      default_endpoint_name = '-'.join(
          [publisher_name, model_name, 'hf', 'mg-cli-deploy']
      )
      api_model_arg = f'{publisher_name}/{model_name}'
    else:
      try:
        publisher_model = mg_client.GetPublisherModel(publisher_model_name)
      except apitools_exceptions.HttpNotFoundError as err:
        # Chain the HTTP error so the root cause stays visible in tracebacks.
        raise c_exceptions.UnknownArgumentException(
            '--model',
            f'{args.model} is not a supported Model Garden model for'
            ' deployment in Model Garden.',
        ) from err

      # Here model_name is `{model}@{version}`; only the version part is used
      # in the default endpoint name.
      default_endpoint_name = '-'.join([
          publisher_name,
          model_name.split('@')[1],
          'mg-cli-deploy',
      ])
      api_model_arg = publisher_model_name

    deploy_config = model_garden_utils.GetDeployConfig(args, publisher_model)
    machine_spec = deploy_config.dedicatedResources.machineSpec

    # Step 2: Check accelerator quota.
    model_garden_utils.CheckAcceleratorQuota(
        args,
        machine_type=machine_spec.machineType,
        accelerator_type=str(machine_spec.acceleratorType),
        accelerator_count=machine_spec.acceleratorCount,
    )
    # Clear the aiplatform URI value so that new values can be set.
    properties.VALUES.api_endpoint_overrides.aiplatform.Set(None)

    # Step 3: Deploy the model.
    self._DeployInRegion(
        args, version, machine_spec, default_endpoint_name, api_model_arg
    )

  def _DeployInRegion(
      self, args, version, machine_spec, default_endpoint_name, model
  ):
    """Calls model_garden_utils.Deploy under the user-selected region.

    Args:
      args: The parsed command-line arguments; `args.region` selects the
        endpoint override region.
      version: The API version string used for clients and endpoint overrides.
      machine_spec: The machine spec message to deploy with, or None.
      default_endpoint_name: Endpoint display name used when
        --endpoint-display-name is not set.
      model: The model identifier passed through to Deploy.
    """
    with endpoint_util.AiplatformEndpointOverrides(
        version, region=args.region
    ):
      # Clients are created inside the override so they are constructed while
      # the regional endpoint is active.
      mg_client = client_mg.ModelGardenClient()
      operation_client = operations.OperationsClient(version=version)
      endpoint_name = args.endpoint_display_name or default_endpoint_name

      model_garden_utils.Deploy(
          args,
          machine_spec,
          endpoint_name,
          model,
          operation_client,
          mg_client,
      )