File: //snap/google-cloud-cli/394/lib/surface/ai/model_garden/models/deploy.py
# -*- coding: utf-8 -*- #
# Copyright 2024 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model Garden deploy command."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
import time
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai import operations
from googlecloudsdk.api_lib.ai.model_garden import client as client_mg
from googlecloudsdk.api_lib.util import apis
from googlecloudsdk.calliope import arg_parsers
from googlecloudsdk.calliope import base
from googlecloudsdk.calliope import exceptions as c_exceptions
from googlecloudsdk.command_lib.ai import constants
from googlecloudsdk.command_lib.ai import endpoint_util
from googlecloudsdk.command_lib.ai import flags
from googlecloudsdk.command_lib.ai import model_garden_utils
from googlecloudsdk.command_lib.ai import region_util
from googlecloudsdk.command_lib.ai import validation
from googlecloudsdk.core import properties
@base.ReleaseTracks(
    base.ReleaseTrack.ALPHA, base.ReleaseTrack.BETA, base.ReleaseTrack.GA
)
@base.UniverseCompatible
class Deploy(base.Command):
  """Deploy a model in Model Garden to a Vertex AI endpoint.

  ## EXAMPLES

  To deploy a Model Garden model `google/gemma2@gemma-2-9b` under project
  `example` in region
  `us-central1`, run:

      $ gcloud ai model-garden models deploy
      --model=google/gemma2@gemma-2-9b
      --project=example
      --region=us-central1

  To deploy a Hugging Face model `meta-llama/Meta-Llama-3-8B` under project
  `example` in region `us-central1`, run:

      $ gcloud ai model-garden models deploy
      --model=meta-llama/Meta-Llama-3-8B
      --hugging-face-access-token={hf_token}
      --project=example
      --region=us-central1
  """

  @staticmethod
  def Args(parser):
    """Registers every flag accepted by the deploy command on `parser`.

    Args:
      parser: The argument parser the flags are added to.
    """
    # --- Model selection and endpoint naming -------------------------------
    base.Argument(
        '--model',
        required=True,
        help=(
            'The model to be deployed. If it is a Model Garden model, it should'
            ' be in the format of'
            ' `{publisher_name}/{model_name}@{model_version_name}`, e.g.'
            ' `google/gemma2@gemma-2-2b`. If it is a Hugging Face model, it'
            ' should be in the convention of Hugging Face models, e.g.'
            ' `meta-llama/Meta-Llama-3-8B`. If it is a Custom Weights model, it'
            ' should be in the format of `gs://{gcs_bucket_uri}`, e.g. `gs://'
            '-model-garden-public-us/llama3.1/Meta-Llama-3.1-8B-Instruct`.'
        ),
    ).AddToParser(parser)
    base.Argument(
        '--hugging-face-access-token',
        required=False,
        help=(
            'The access token from Hugging Face needed to read the'
            ' model artifacts of gated models. It is only needed when'
            ' the Hugging Face model to deploy is gated.'
        ),
    ).AddToParser(parser)
    base.Argument(
        '--endpoint-display-name',
        required=False,
        help='Display name of the endpoint with the deployed model.',
    ).AddToParser(parser)
    flags.AddRegionResourceArg(
        parser, 'to deploy the model', prompt_func=region_util.PromptForOpRegion
    )

    # --- Hardware selection -------------------------------------------------
    base.Argument(
        '--machine-type',
        help=(
            'The machine type to deploy the model to. It should be a supported'
            ' machine type from the deployment configurations of the model. Use'
            ' `gcloud ai model-garden models list-deployment-config` to check'
            ' the supported machine types.'
        ),
        required=False,
    ).AddToParser(parser)
    base.Argument(
        '--accelerator-type',
        help=(
            'The accelerator type to serve the model. It should be a supported'
            ' accelerator type from the verified deployment configurations of'
            ' the model. Use `gcloud ai model-garden models'
            ' list-deployment-config` to check the supported accelerator types.'
        ),
        required=False,
    ).AddToParser(parser)
    base.Argument(
        '--accelerator-count',
        help=(
            'The accelerator count to serve the model. Accelerator count'
            ' should be non-negative.'
        ),
        type=int,
        required=False,
    ).AddToParser(parser)

    # --- Deployment behaviour ----------------------------------------------
    base.Argument(
        '--accept-eula',
        help=(
            'When set, the user accepts the End User License Agreement (EULA)'
            ' of the model.'
        ),
        action='store_true',
        default=False,
        required=False,
    ).AddToParser(parser)
    base.Argument(
        '--asynchronous',
        help=(
            'If set to true, the command will terminate immediately and not'
            ' keep polling the operation status.'
        ),
        action='store_true',
        default=False,
        required=False,
    ).AddToParser(parser)
    base.Argument(
        '--reservation-affinity',
        type=arg_parsers.ArgDict(
            spec={
                'reservation-affinity-type': str,
                'key': str,
                'values': arg_parsers.ArgList(),
            },
            required_keys=['reservation-affinity-type'],
        ),
        help=(
            'A ReservationAffinity can be used to configure a Vertex AI'
            ' resource (e.g., a DeployedModel) to draw its Compute Engine'
            ' resources from a Shared Reservation, or exclusively from'
            ' on-demand capacity.'
        ),
    ).AddToParser(parser)
    base.Argument(
        '--spot',
        action='store_true',
        default=False,
        required=False,
        help='If true, schedule the deployment workload on Spot VM.',
    ).AddToParser(parser)
    base.Argument(
        '--use-dedicated-endpoint',
        action='store_true',
        default=False,
        required=False,
        help=(
            'If true, the endpoint will be exposed through a dedicated DNS.'
            ' Your request to the dedicated DNS will be isolated from other'
            " users' traffic and will have better performance and reliability."
        ),
    ).AddToParser(parser)
    base.Argument(
        '--enable-fast-tryout',
        action='store_true',
        default=False,
        required=False,
        help=(
            'If True, model will be deployed using faster deployment path.'
            ' Useful for quick experiments. Not for production workloads. Only'
            ' available for most popular models with certain machine types.'
        ),
    ).AddToParser(parser)

    # --- Serving container overrides ---------------------------------------
    base.Argument(
        '--container-image-uri',
        help=("""\
URI of the Model serving container file in the Container Registry
(e.g. gcr.io/myproject/server:latest).
"""),
    ).AddToParser(parser)
    parser.add_argument(
        '--container-env-vars',
        metavar='KEY=VALUE',
        type=arg_parsers.ArgDict(),
        action=arg_parsers.UpdateAction,
        help='List of key-value pairs to set as environment variables.',
    )
    parser.add_argument(
        '--container-command',
        type=arg_parsers.ArgList(),
        metavar='COMMAND',
        action=arg_parsers.UpdateAction,
        help="""\
Entrypoint for the container image. If not specified, the container
image's default entrypoint is run.
""",
    )
    # The help text refers to `--container-command`: this command defines no
    # plain `--command` flag.
    parser.add_argument(
        '--container-args',
        metavar='ARG',
        type=arg_parsers.ArgList(),
        help="""\
Comma-separated arguments passed to the command run by the container
image. If not specified and no `--container-command` is provided, the container
image's default command is used.
""",
    )
    parser.add_argument(
        '--container-ports',
        metavar='PORT',
        type=arg_parsers.ArgList(element_type=arg_parsers.BoundedInt(1, 65535)),
        action=arg_parsers.UpdateAction,
        help="""\
Container ports to receive http requests at. Must be a number between 1 and
65535, inclusive.
""",
    )
    parser.add_argument(
        '--container-grpc-ports',
        metavar='PORT',
        type=arg_parsers.ArgList(element_type=arg_parsers.BoundedInt(1, 65535)),
        action=arg_parsers.UpdateAction,
        help="""\
Container ports to receive grpc requests at. Must be a number between 1 and
65535, inclusive.
""",
    )
    parser.add_argument(
        '--container-predict-route',
        help='HTTP path to send prediction requests to inside the container.',
    )
    parser.add_argument(
        '--container-health-route',
        help='HTTP path to send health checks to inside the container.',
    )
    parser.add_argument(
        '--container-deployment-timeout-seconds',
        type=int,
        help='Deployment timeout in seconds.',
    )
    parser.add_argument(
        '--container-shared-memory-size-mb',
        type=int,
        help="""\
The amount of the VM memory to reserve as the shared memory for the model in
megabytes.
""",
    )

    # --- Container startup / health probes ---------------------------------
    parser.add_argument(
        '--container-startup-probe-exec',
        type=arg_parsers.ArgList(),
        metavar='STARTUP_PROBE_EXEC',
        help="""\
Exec specifies the action to take. Used by startup probe. An example of this
argument would be ["cat", "/tmp/healthy"].
""",
    )
    parser.add_argument(
        '--container-startup-probe-period-seconds',
        type=int,
        help="""\
How often (in seconds) to perform the startup probe. Default to 10 seconds.
Minimum value is 1.
""",
    )
    parser.add_argument(
        '--container-startup-probe-timeout-seconds',
        type=int,
        help="""\
Number of seconds after which the startup probe times out. Defaults to 1 second.
Minimum value is 1.
""",
    )
    parser.add_argument(
        '--container-health-probe-exec',
        type=arg_parsers.ArgList(),
        metavar='HEALTH_PROBE_EXEC',
        help="""\
Exec specifies the action to take. Used by health probe. An example of this
argument would be ["cat", "/tmp/healthy"].
""",
    )
    parser.add_argument(
        '--container-health-probe-period-seconds',
        type=int,
        help="""\
How often (in seconds) to perform the health probe. Default to 10 seconds.
Minimum value is 1.
""",
    )
    parser.add_argument(
        '--container-health-probe-timeout-seconds',
        type=int,
        help="""\
Number of seconds after which the health probe times out. Defaults to 1 second.
Minimum value is 1.
""",
    )

  def Run(self, args):
    """Validates the flags and deploys the requested model.

    A `--model` value starting with `gs://` is treated as a custom weights
    model; anything else is treated as a Model Garden or Hugging Face model.

    Args:
      args: The parsed command-line arguments. `args.region` is overwritten
        with the location id parsed from the region resource argument.

    Raises:
      c_exceptions.InvalidArgumentException: For a custom weights model, when
        only some of --machine-type, --accelerator-type and
        --accelerator-count are provided.
      c_exceptions.UnknownArgumentException: When the publisher model lookup
        returns HTTP 404.
    """
    is_custom_weights_model = args.model.startswith('gs://')
    if not is_custom_weights_model:
      validation.ValidateModelGardenModelArgs(args)
    validation.ValidateDisplayName(args.endpoint_display_name)

    region_ref = args.CONCEPTS.region.Parse()
    args.region = region_ref.AsDict()['locationsId']
    version = constants.BETA_VERSION

    # Lookups and quota checks run against us-central1; the deployment itself
    # is re-targeted at args.region by _DeployToRegion.
    with endpoint_util.AiplatformEndpointOverrides(
        version, region='us-central1'
    ):
      if is_custom_weights_model:
        self._DeployCustomWeightsModel(args, version)
      else:
        self._DeployModelGardenModel(args, version)

  def _DeployCustomWeightsModel(self, args, version):
    """Deploys custom weights stored at the `gs://` URI in `args.model`.

    Args:
      args: The parsed command-line arguments.
      version: The API version key (as used by `constants`).

    Raises:
      c_exceptions.InvalidArgumentException: When only some of the three
        hardware flags are provided.
    """
    # The three hardware flags must be all truthy or all falsy. Note that an
    # accelerator count of 0 is falsy and therefore counts as "not provided".
    if not (
        bool(args.machine_type)
        == bool(args.accelerator_type)
        == bool(args.accelerator_count)
    ):
      raise c_exceptions.InvalidArgumentException(
          '--machine-type, --accelerator-type and --accelerator-count',
          ' Arguments for MachineType, AcceleratorType and AcceleratorCount'
          ' must either all be provided or all be empty for custom weights'
          ' model deployment.',
      )

    # Without explicit hardware flags no machine spec is passed to Deploy.
    machine_spec = None
    if args.machine_type:
      # Check accelerator quota.
      model_garden_utils.CheckAcceleratorQuota(
          args,
          machine_type=args.machine_type,
          accelerator_type=args.accelerator_type,
          accelerator_count=args.accelerator_count,
      )
      client = apis.GetClientInstance(
          constants.AI_PLATFORM_API_NAME,
          constants.AI_PLATFORM_API_VERSION[version],
      )
      machine_spec_cls = (
          client.MESSAGES_MODULE.GoogleCloudAiplatformV1beta1MachineSpec
      )
      machine_spec = machine_spec_cls(
          machineType=args.machine_type,
          acceleratorType=machine_spec_cls.AcceleratorTypeValueValuesEnum(
              args.accelerator_type
          ),
          acceleratorCount=args.accelerator_count,
      )

    # Default endpoint name embeds the current Unix time in whole seconds.
    default_endpoint_name = '-'.join([
        'custom-weights',
        str(int(time.time())),
        'mg-cli-deploy',
    ])
    self._DeployToRegion(
        args, version, machine_spec, default_endpoint_name, args.model
    )

  def _DeployModelGardenModel(self, args, version):
    """Deploys a Model Garden or Hugging Face publisher model.

    A `--model` value without an `@` is treated as a Hugging Face model.

    Args:
      args: The parsed command-line arguments.
      version: The API version key (as used by `constants`).

    Raises:
      c_exceptions.UnknownArgumentException: When the publisher model lookup
        returns HTTP 404.
    """
    is_hf_model = '@' not in args.model

    # Step 1: Fetch PublisherModel data, including deployment configs. Use
    # us-central1 because all data are stored in us-central1.
    mg_client = client_mg.ModelGardenClient()
    if is_hf_model:
      # Convert to lower case because API only takes in lower case.
      publisher_name, model_name = args.model.lower().split('/')
      try:
        publisher_model = mg_client.GetPublisherModel(
            model_name=f'publishers/{publisher_name}/models/{model_name}',
            is_hugging_face_model=True,
        )
      except apitools_exceptions.HttpNotFoundError as err:
        raise c_exceptions.UnknownArgumentException(
            '--model',
            f'{args.model} is not a supported Hugging Face'
            ' model for deployment in Model Garden.',
        ) from err
      default_endpoint_name = '-'.join(
          [publisher_name, model_name, 'hf', 'mg-cli-deploy']
      )
      api_model_arg = f'{publisher_name}/{model_name}'
    else:
      # Convert to lower case because API only takes in lower case.
      publisher_name, model_and_version_name = args.model.lower().split('/')
      try:
        publisher_model = mg_client.GetPublisherModel(
            f'publishers/{publisher_name}/models/{model_and_version_name}'
        )
      except apitools_exceptions.HttpNotFoundError as err:
        raise c_exceptions.UnknownArgumentException(
            '--model',
            f'{args.model} is not a supported Model Garden model for'
            ' deployment in Model Garden.',
        ) from err
      # Only the version part (after '@') goes into the endpoint name.
      default_endpoint_name = '-'.join([
          publisher_name,
          model_and_version_name.split('@')[1],
          'mg-cli-deploy',
      ])
      api_model_arg = (
          f'publishers/{publisher_name}/models/{model_and_version_name}'
      )

    deploy_config = model_garden_utils.GetDeployConfig(args, publisher_model)
    machine_spec = deploy_config.dedicatedResources.machineSpec

    # Step 2: Check accelerator quota.
    model_garden_utils.CheckAcceleratorQuota(
        args,
        machine_type=machine_spec.machineType,
        accelerator_type=str(machine_spec.acceleratorType),
        accelerator_count=machine_spec.acceleratorCount,
    )

    # Step 3: Deploy the model.
    self._DeployToRegion(
        args, version, machine_spec, default_endpoint_name, api_model_arg
    )

  def _DeployToRegion(
      self, args, version, machine_spec, default_endpoint_name, model
  ):
    """Calls `model_garden_utils.Deploy` against the `args.region` endpoint.

    Args:
      args: The parsed command-line arguments.
      version: The API version key (as used by `constants`).
      machine_spec: The machine spec to deploy on, or None.
      default_endpoint_name: Endpoint display name used when
        `--endpoint-display-name` is not set.
      model: The model identifier passed through to
        `model_garden_utils.Deploy`.
    """
    # Clear the aiplatform URI value so that new values can be set. This runs
    # nested inside the us-central1 override opened by Run.
    # NOTE(review): the custom weights path previously skipped this reset;
    # it now matches the Model Garden path -- confirm against endpoint_util.
    properties.VALUES.api_endpoint_overrides.aiplatform.Set(None)
    with endpoint_util.AiplatformEndpointOverrides(
        version, region=args.region
    ):
      mg_client = client_mg.ModelGardenClient()
      operation_client = operations.OperationsClient(version=version)
      model_garden_utils.Deploy(
          args,
          machine_spec,
          args.endpoint_display_name or default_endpoint_name,
          model,
          operation_client,
          mg_client,
      )