HEX

File: //snap/google-cloud-cli/396/lib/googlecloudsdk/command_lib/ai/custom_jobs/flags.py
# -*- coding: utf-8 -*- #
# Copyright 2021 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Flags definition specifically for gcloud ai custom-jobs."""

from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

import argparse
import textwrap

from googlecloudsdk.calliope import arg_parsers
from googlecloudsdk.calliope import base
from googlecloudsdk.calliope.concepts import concepts
from googlecloudsdk.command_lib.ai import constants
from googlecloudsdk.command_lib.ai import flags as shared_flags
from googlecloudsdk.command_lib.ai import region_util
from googlecloudsdk.command_lib.ai.custom_jobs import custom_jobs_util
from googlecloudsdk.command_lib.util.args import labels_util
from googlecloudsdk.command_lib.util.concepts import concept_parsers

# Required `--display-name` flag used when creating a custom job.
_DISPLAY_NAME = base.Argument(
    '--display-name',
    help='Display name of the custom job to create.',
    required=True)

# `--python-package-uris` flag: a comma-separated list (parsed by
# `arg_parsers.ArgList`) of Python package URIs used with a pre-built
# container image.
_PYTHON_PACKAGE_URIS = base.Argument(
    '--python-package-uris',
    metavar='PYTHON_PACKAGE_URIS',
    type=arg_parsers.ArgList(),
    help=('The common Python package URIs to be used for training with a '
          'pre-built container image. e.g. '
          '`--python-package-uris=path1,path2`. '
          'If you are using multiple worker pools and want to specify a '
          'different Python package for each pool, use `--config` instead.'))

# `--config` flag: path to a YAML file containing the job specification. The
# help text states that command-line arguments override values from the file.
_CUSTOM_JOB_CONFIG = base.Argument(
    '--config',
    help=textwrap.dedent("""\
      Path to the job configuration file. This file should be a YAML document
      containing a [`CustomJobSpec`](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec).
      If an option is specified both in the configuration file **and** via command-line arguments, the command-line arguments
      override the configuration file. Note that keys with underscore are invalid.

      Example(YAML):

        workerPoolSpecs:
          machineSpec:
            machineType: n1-highmem-2
          replicaCount: 1
          containerSpec:
            imageUri: gcr.io/ucaip-test/ucaip-training-test
            args:
            - port=8500
            command:
            - start"""))

# `--worker-pool-spec` flag: a repeatable (`action='append'`) dictionary flag
# describing one worker pool per occurrence. List-valued keys use ';' as their
# delimiter because ',' already separates the key=value pairs of the dict.
_WORKER_POOL_SPEC = base.Argument(
    '--worker-pool-spec',
    action='append',
    type=arg_parsers.ArgDict(
        spec={
            'replica-count': int,
            'machine-type': str,
            'accelerator-type': str,
            'accelerator-count': int,
            'container-image-uri': str,
            'executor-image-uri': str,
            'output-image-uri': str,
            'python-module': str,
            'script': str,
            'local-package-path': str,
            'requirements': arg_parsers.ArgList(custom_delim_char=';'),
            'extra-dirs': arg_parsers.ArgList(custom_delim_char=';'),
            'extra-packages': arg_parsers.ArgList(custom_delim_char=';'),
        }),
    metavar='WORKER_POOL_SPEC',
    help=textwrap.dedent("""\
      Define the worker pool configuration used by the custom job. You can
      specify multiple worker pool specs in order to create a custom job with
      multiple worker pools.

      The spec can contain the following fields:

      *machine-type*::: (Required): The type of the machine.
        see https://cloud.google.com/vertex-ai/docs/training/configure-compute#machine-types
        for supported types. This is corresponding to the `machineSpec.machineType`
        field in `WorkerPoolSpec` API message.
      *replica-count*::: The number of worker replicas to use for this worker
        pool, by default the value is 1. This is corresponding to the `replicaCount`
        field in `WorkerPoolSpec` API message.
      *accelerator-type*::: The type of GPUs.
        see https://cloud.google.com/vertex-ai/docs/training/configure-compute#specifying_gpus
        for more requirements. This is corresponding to the `machineSpec.acceleratorType`
        field in `WorkerPoolSpec` API message.
      *accelerator-count*::: The number of GPUs for each VM in the worker pool to
        use, by default the value is 1. This is corresponding to the
        `machineSpec.acceleratorCount` field in `WorkerPoolSpec` API message.
      *container-image-uri*::: The URI of a container image to be directly run on
        each worker replica. This is corresponding to the
        `containerSpec.imageUri` field in `WorkerPoolSpec` API message.
      *executor-image-uri*::: The URI of a container image that will run the
        provided package.
      *output-image-uri*::: The URI of a custom container image to be built for
      autopackaged custom jobs.
      *python-module*::: The Python module name to run within the provided
        package.
      *local-package-path*::: The local path of a folder that contains training
        code.
      *script*::: The relative path under the `local-package-path` to a file to
        execute. It can be a Python file or an arbitrary bash script.
      *requirements*::: Python dependencies to be installed from PyPI, separated
        by ";". This is supposed to be used when some public packages are
        required by your training application but not in the base images.
        It has the same effect as editing a "requirements.txt" file under
        `local-package-path`.
      *extra-packages*::: Relative paths of local Python archives to be installed,
        separated by ";". This is supposed to be used when some custom packages
        are required by your training application but not in the base images.
        Every path should be relative to the `local-package-path`.
      *extra-dirs*::: Relative paths of the folders under `local-package-path`
       to be copied into the container, separated by ";". If not specified, only
       the parent directory that contains the main executable (`script` or
       `python-module`) will be copied.


      ::::
      Note that some of these fields are used for different job creation methods
      and are categorized as mutually exclusive groups listed below. Exactly one of
      these groups of fields must be specified:


      `container-image-uri`::::
      Specify this field to use a custom container image for training. Together
      with the `--command` and `--args` flags, this field represents a
      [`WorkerPoolSpec.ContainerSpec`](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec?#containerspec)
      message.
      In this case, the `--python-package-uris` flag is disallowed.

      Example:
      --worker-pool-spec=replica-count=1,machine-type=n1-highmem-2,container-image-uri=gcr.io/ucaip-test/ucaip-training-test

      `executor-image-uri, python-module`::::
      Specify these fields to train using a pre-built container and Python
      packages that are already in Cloud Storage. Together with the
      `--python-package-uris` and `--args` flags, these fields represent a
      [`WorkerPoolSpec.PythonPackageSpec`](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#pythonpackagespec)
      message.

      Example:
      --worker-pool-spec=machine-type=e2-standard-4,executor-image-uri=us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-4:latest,python-module=trainer.task

      `output-image-uri`::::
      Specify this field to push the output custom container training image to a specific path in Container Registry or Artifact Registry for an autopackaged custom job.

      Example:
      --worker-pool-spec=machine-type=e2-standard-4,executor-image-uri=us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-4:latest,output-image-uri='eu.gcr.io/projectName/imageName',python-module=trainer.task

      `local-package-path, executor-image-uri, output-image-uri, python-module|script`::::
      Specify these fields, optionally with `requirements`, `extra-packages`, or
      `extra-dirs`, to train using a pre-built container and Python code from a
      local path.
      In this case, the `--python-package-uris` flag is disallowed.

      Example using `python-module`:
      --worker-pool-spec=machine-type=e2-standard-4,replica-count=1,executor-image-uri=us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-4:latest,python-module=trainer.task,local-package-path=/usr/page/application

      Example using `script`:
      --worker-pool-spec=machine-type=e2-standard-4,replica-count=1,executor-image-uri=us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-4:latest,script=my_run.sh,local-package-path=/usr/jeff/application
      """))

# `--command` flag: list-valued command run when the containers start.
_CUSTOM_JOB_COMMAND = base.Argument(
    '--command',
    metavar='COMMAND',
    action=arg_parsers.UpdateAction,
    type=arg_parsers.ArgList(),
    help="""\
    Command to be invoked when containers are started.
    It overrides the entrypoint instruction in Dockerfile when provided.
    """)
# `--args` flag: list-valued arguments passed to containers or python tasks.
_CUSTOM_JOB_ARGS = base.Argument(
    '--args',
    type=arg_parsers.ArgList(),
    action=arg_parsers.UpdateAction,
    metavar='ARG',
    help='Comma-separated arguments passed to containers or python tasks.')

# `--persistent-resource-id` flag: names an existing persistent resource (same
# project and region, per the help text) on which the custom job should run.
_PERSISTENT_RESOURCE_ID = base.Argument(
    '--persistent-resource-id',
    metavar='PERSISTENT_RESOURCE_ID',
    help="""\
    The name of the persistent resource from the same project and region on
    which to run this custom job.

    If this is specified, the job will be run on existing machines held by the
    PersistentResource instead of on-demand short-lived machines.
    The network and CMEK configs on the job should be consistent with those on
    the PersistentResource, otherwise, the job will be rejected.
    """)


def AddCreateCustomJobFlags(parser):
  """Registers every flag accepted when creating a custom job.

  Args:
    parser: the parser for the command.
  """
  region_prompt = region_util.GetPromptForRegionFunc(
      constants.SUPPORTED_TRAINING_REGIONS)
  shared_flags.AddRegionResourceArg(
      parser, 'to create a custom job', prompt_func=region_prompt)

  # Flags defined in the shared `ai` flags module.
  for shared_flag in (shared_flags.TRAINING_SERVICE_ACCOUNT,
                      shared_flags.NETWORK,
                      shared_flags.ENABLE_WEB_ACCESS,
                      shared_flags.ENABLE_DASHBOARD_ACCESS):
    shared_flag.AddToParser(parser)
  shared_flags.AddKmsKeyResourceArg(parser, 'custom job')

  labels_util.AddCreateLabelsFlags(parser)

  # Flags defined in this module.
  for job_flag in (_DISPLAY_NAME,
                   _PYTHON_PACKAGE_URIS,
                   _CUSTOM_JOB_ARGS,
                   _CUSTOM_JOB_COMMAND,
                   _PERSISTENT_RESOURCE_ID):
    job_flag.AddToParser(parser)

  # Required group holding the two ways of describing the worker pools.
  spec_group = base.ArgumentGroup(
      required=True, help='Worker pool specification.')
  for spec_flag in (_CUSTOM_JOB_CONFIG, _WORKER_POOL_SPEC):
    spec_group.AddArgument(spec_flag)
  spec_group.AddToParser(parser)


def AddCustomJobResourceArg(parser,
                            verb,
                            regions=constants.SUPPORTED_TRAINING_REGIONS):
  """Registers the resource argument that identifies a Vertex AI custom job.

  NOTE: Must be used only if it's the only resource arg in the command.

  Args:
    parser: the parser for the command.
    verb: str, the verb to describe the job resource, such as 'to update'.
    regions: list[str], the list of supported regions.
  """
  # The region attribute prompts the user from the supplied region list.
  region_attribute = shared_flags.RegionAttributeConfig(
      prompt_func=region_util.GetPromptForRegionFunc(regions))
  spec = concepts.ResourceSpec(
      resource_collection=custom_jobs_util.CUSTOM_JOB_COLLECTION,
      resource_name='custom job',
      locationsId=region_attribute,
      projectsId=concepts.DEFAULT_PROJECT_ATTRIBUTE_CONFIG,
      disable_auto_completers=False)

  help_text = 'The custom job {}.'.format(verb)
  resource_parser = concept_parsers.ConceptParser.ForResource(
      'custom_job', spec, help_text, required=True)
  resource_parser.AddToParser(parser)


def AddLocalRunCustomJobFlags(parser):
  """Add local-run related flags to the parser.

  Args:
    parser: the parser for the command.
  """

  # Flags for entry point of the training application; `--python-module` and
  # `--script` are mutually exclusive.
  application_group = parser.add_mutually_exclusive_group()
  application_group.add_argument(
      '--python-module',
      metavar='PYTHON_MODULE',
      help=textwrap.dedent("""
      Name of the python module to execute, in 'trainer.train' or 'train'
      format. Its path should be relative to the `work_dir`.
      """))
  application_group.add_argument(
      '--script',
      metavar='SCRIPT',
      help=textwrap.dedent("""
      The relative path of the file to execute. Accepts a Python file or an
      arbitrary bash script. This path should be relative to the `work_dir`.
      """))

  # Flags for working directory.
  parser.add_argument(
      '--local-package-path',
      metavar='LOCAL_PATH',
      suggestion_aliases=['--work-dir'],
      help=textwrap.dedent("""
      Local path of the directory where the python-module or script exists.
      If not specified, it uses the directory where you run this command.

      Only the contents of this directory will be accessible to the built
      container image.
      """))

  # Flags for extra directory
  parser.add_argument(
      '--extra-dirs',
      metavar='EXTRA_DIR',
      type=arg_parsers.ArgList(),
      help=textwrap.dedent("""
      Extra directories under the working directory to include, besides the one
      that contains the main executable.

      By default, only the parent directory of the main script or python module
      is copied to the container.
      For example, if the module is "training.task" or the script is
      "training/task.py", the whole "training" directory, including its
      sub-directories, will always be copied to the container. You may specify
      this flag to also copy other directories if necessary.

      Note: if no parent is specified in 'python_module' or 'script', the whole
      working directory is copied, then you don't need to specify this flag.
      """))

  # Flags for base container image
  parser.add_argument(
      '--executor-image-uri',
      metavar='IMAGE_URI',
      required=True,
      suggestion_aliases=['--base-image'],
      help=textwrap.dedent("""
      URI or ID of the container image in either the Container Registry or local
      that will run the application.
      See https://cloud.google.com/vertex-ai/docs/training/pre-built-containers
      for available pre-built container images provided by Vertex AI for training.
      """))

  # Flags for extra requirements.
  parser.add_argument(
      '--requirements',
      metavar='REQUIREMENTS',
      type=arg_parsers.ArgList(),
      help=textwrap.dedent("""
      Python dependencies from PyPI to be used when running the application.
      If this is not specified, and there is no "setup.py" or "requirements.txt"
      in the working directory, your application will only have access to what
      exists in the base image with no other dependencies.

      Example:
      'tensorflow-cpu, pandas==1.2.0, matplotlib>=3.0.2'
      """))

  # Flags for extra dependency .
  parser.add_argument(
      '--extra-packages',
      metavar='PACKAGE',
      type=arg_parsers.ArgList(),
      help=textwrap.dedent("""
      Local paths to Python archives used as training dependencies in the image
      container.
      These can be absolute or relative paths. However, they have to be under
      the work_dir; otherwise, this tool will not be able to access it.

      Example:
      'dep1.tar.gz, ./downloads/dep2.whl'
      """))

  # Flags for the output image
  parser.add_argument(
      '--output-image-uri',
      metavar='OUTPUT_IMAGE',
      help=textwrap.dedent("""
      Uri of the custom container image to be built with your application
      packed in.
      """))

  # Flags for GPU support
  parser.add_argument(
      '--gpu', action='store_true', default=False, help='Enable to use GPU.')

  # Flags for docker run
  parser.add_argument(
      '--docker-run-options',
      metavar='DOCKER_RUN_OPTIONS',
      hidden=True,
      type=arg_parsers.ArgList(),
      help=textwrap.dedent("""
      Custom Docker run options to pass to image during execution.
      For example, '--no-healthcheck, -a stdin'.

      See https://docs.docker.com/engine/reference/commandline/run/#options for
      more details.
      """))

  # Flags for service account
  parser.add_argument(
      '--service-account-key-file',
      metavar='ACCOUNT_KEY_FILE',
      help=textwrap.dedent("""
      The JSON file of a Google Cloud service account private key.
      When specified, the corresponding service account will be used to
      authenticate the local container to access Google Cloud services.
      Note that the key file won't be copied to the container, it will be
      mounted during running time.
      """))

  # User custom flags: everything left on the command line is collected into
  # `args`. The example uses `--executor-image-uri`, the real required flag;
  # `--base-image` is only registered above as a suggestion alias.
  parser.add_argument(
      'args',
      nargs=argparse.REMAINDER,
      default=[],
      help="""Additional user arguments to be forwarded to your application.""",
      example=('$ {command} --script=my_run.sh '
               '--executor-image-uri=gcr.io/my/image '
               '-- --my-arg bar --enable_foo'))