File: //snap/google-cloud-cli/396/lib/googlecloudsdk/command_lib/dataproc/jobs/pyspark.py
# -*- coding: utf-8 -*- #
# Copyright 2015 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Base class for PySpark Job."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
import argparse
from apitools.base.py import encoding
from googlecloudsdk.calliope import arg_parsers
from googlecloudsdk.command_lib.dataproc.jobs import base as job_base
from googlecloudsdk.command_lib.dataproc.jobs import util as job_util
class PySparkBase(job_base.JobBase):
  """Submit a PySpark job to a cluster."""
  @staticmethod
  def Args(parser):
    """Performs command-line argument parsing specific to PySpark."""
    parser.add_argument(
        'py_file',
        help='Main .py file to run as the driver.')
    parser.add_argument(
        '--py-files',
        type=arg_parsers.ArgList(),
        metavar='PY_FILE',
        default=[],
        help=('Comma separated list of Python files to be provided to the job. '
              'Must be one of the following file formats '
              '".py, .zip, or .egg".'))
    parser.add_argument(
        '--jars',
        type=arg_parsers.ArgList(),
        metavar='JAR',
        default=[],
        help=('Comma separated list of jar files to be provided to the '
              'executor and driver classpaths.'))
    parser.add_argument(
        '--files',
        type=arg_parsers.ArgList(),
        metavar='FILE',
        default=[],
        help=('Comma separated list of files to be placed in the working '
              'directory of both the app driver and executors.'))
    parser.add_argument(
        '--archives',
        type=arg_parsers.ArgList(),
        metavar='ARCHIVE',
        default=[],
        help=(
            'Comma separated list of archives to be extracted into the working '
            'directory of each executor. '
            'Must be one of the following file formats: .zip, .tar, .tar.gz, '
            'or .tgz.'))
    parser.add_argument(
        'job_args',
        nargs=argparse.REMAINDER,
        help='Arguments to pass to the driver.')
    parser.add_argument(
        '--properties',
        type=arg_parsers.ArgDict(),
        metavar='PROPERTY=VALUE',
        help='List of key value pairs to configure PySpark. For a list of '
             'available properties, see: '
             'https://spark.apache.org/docs/latest/'
             'configuration.html#available-properties.')
    parser.add_argument(
        '--properties-file',
        help=job_util.PROPERTIES_FILE_HELP_TEXT)
    parser.add_argument(
        '--driver-log-levels',
        type=arg_parsers.ArgDict(),
        metavar='PACKAGE=LEVEL',
        help=('List of key value pairs to configure driver logging, where key '
              'is a package and value is the log4j log level. For '
              'example: root=FATAL,com.example=INFO'))
  @staticmethod
  def GetFilesByType(args):
    return {
        'py_file': args.py_file,
        'py_files': args.py_files,
        'archives': args.archives,
        'files': args.files,
        'jars': args.jars}
  @staticmethod
  def ConfigureJob(messages, job, files_by_type, logging_config, args):
    """Populates the pysparkJob member of the given job."""
    pyspark_job = messages.PySparkJob(
        args=args.job_args or [],
        archiveUris=files_by_type['archives'],
        fileUris=files_by_type['files'],
        jarFileUris=files_by_type['jars'],
        pythonFileUris=files_by_type['py_files'],
        mainPythonFileUri=files_by_type['py_file'],
        loggingConfig=logging_config,
    )
    job_properties = job_util.BuildJobProperties(
        args.properties, args.properties_file
    )
    if job_properties:
      # Sort properties to ensure tests comparing messages not fail on ordering.
      pyspark_job.properties = encoding.DictToAdditionalPropertyMessage(
          job_properties, messages.PySparkJob.PropertiesValue, sort_items=True
      )
    job.pysparkJob = pyspark_job