File: //snap/google-cloud-cli/394/platform/bq/frontend/command_mkdef.py
#!/usr/bin/env python
"""The BigQuery CLI mkdef command."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import sys
from typing import Optional
from absl import flags
from frontend import bigquery_command
from frontend import flags as frontend_flags
from frontend import utils as frontend_utils
# These aren't relevant for user-facing docstrings:
# pylint: disable=g-doc-return-or-yield
# pylint: disable=g-doc-args
class MakeExternalTableDefinition(bigquery_command.BigqueryCmd):
  # Implements `bq mkdef`: prints a JSON external table definition to stdout.
  #
  # NOTE(review): this is deliberately a comment rather than a class
  # docstring. The base class is not visible here and may surface docstrings
  # as user-facing help text - confirm before converting it.
  usage = """mkdef <source_uris> [<schema>]"""

  def __init__(self, name: str, fv: flags.FlagValues):
    """Registers every flag accepted by the mkdef command.

    Args:
      name: Command name, forwarded unchanged to the base class.
      fv: Flag registry that all flags defined below are attached to.
    """
    super().__init__(name, fv)

    # --- Schema handling -------------------------------------------------
    flags.DEFINE_boolean(
        'autodetect',
        None,
        'Should schema and format options be autodetected.',
        flag_values=fv,
    )
    flags.DEFINE_boolean(
        'ignore_unknown_values',
        None,
        'Ignore any values in a row that are not present in the schema.',
        short_name='i',
        flag_values=fv,
    )

    # --- Hive partitioning -----------------------------------------------
    flags.DEFINE_string(
        'hive_partitioning_mode',
        None,
        'Enables hive partitioning.  AUTO indicates to perform '
        'automatic type inference.  STRINGS indicates to treat all hive '
        'partition keys as STRING typed.  No other values are accepted',
        flag_values=fv,
    )
    flags.DEFINE_string(
        'hive_partitioning_source_uri_prefix',
        None,
        'Prefix after which hive partition '
        'encoding begins.  For URIs like gs://bucket/path/key1=value/file, '
        'the value should be gs://bucket/path.',
        flag_values=fv,
    )
    flags.DEFINE_boolean(
        'require_hive_partition_filter',
        None,
        'Whether queries against a table are required to '
        'include a hive partition key in a query predicate.',
        flag_values=fv,
    )

    # --- Source format and credentials -----------------------------------
    flags.DEFINE_enum(
        'source_format',
        'CSV',
        [
            'CSV',
            'GOOGLE_SHEETS',
            'NEWLINE_DELIMITED_JSON',
            'DATASTORE_BACKUP',
            'DELTA_LAKE',
            'ORC',
            'PARQUET',
            'AVRO',
            'ICEBERG',
        ],
        'Format of source data. Options include:'
        '\n CSV'
        '\n GOOGLE_SHEETS'
        '\n NEWLINE_DELIMITED_JSON'
        '\n DATASTORE_BACKUP'
        '\n DELTA_LAKE'
        '\n ORC'
        '\n PARQUET'
        '\n ICEBERG'
        '\n AVRO',
        flag_values=fv,
    )
    flags.DEFINE_string(
        'connection_id',
        None,
        'The connection specifying the credentials to be used to read external '
        'storage, such as Azure Blob, Cloud Storage, or S3. The connection_id '
        'can have the form "<project_id>.<location_id>.<connection_id>" or '
        '"projects/<project_id>/locations/<location_id>/connections/'
        '<connection_id>".',
        flag_values=fv,
    )

    # --- Format-specific options (AVRO / Parquet) ------------------------
    flags.DEFINE_boolean(
        'use_avro_logical_types',
        True,
        'If sourceFormat is set to "AVRO", indicates whether to enable '
        'interpreting logical types into their corresponding types '
        '(ie. TIMESTAMP), instead of only using their raw types (ie. INTEGER).',
        flag_values=fv,
    )
    flags.DEFINE_boolean(
        'parquet_enum_as_string',
        False,
        'Infer Parquet ENUM logical type as STRING '
        '(instead of BYTES by default).',
        flag_values=fv,
    )
    flags.DEFINE_boolean(
        'parquet_enable_list_inference',
        False,
        frontend_utils.PARQUET_LIST_INFERENCE_DESCRIPTION,
        flag_values=fv,
    )

    # --- Metadata --------------------------------------------------------
    flags.DEFINE_enum(
        'metadata_cache_mode',
        None,
        ['AUTOMATIC', 'MANUAL'],
        'Enables metadata cache for an external table with a connection. '
        'Specify AUTOMATIC to automatically refresh the cached metadata. '
        'Specify MANUAL to stop the automatic refresh.',
        flag_values=fv,
    )
    # NOTE(review): the help text lists only SIMPLE although DIRECTORY is an
    # accepted value - confirm whether that omission is intentional.
    flags.DEFINE_enum(
        'object_metadata',
        None,
        ['DIRECTORY', 'SIMPLE'],
        'Object Metadata Type. Options include:\n SIMPLE.',
        flag_values=fv,
    )

    # --- File contents and discovery -------------------------------------
    flags.DEFINE_boolean(
        'preserve_ascii_control_characters',
        False,
        'Whether to preserve embedded Ascii Control characters in CSV External '
        'table ',
        flag_values=fv,
    )
    flags.DEFINE_string(
        'reference_file_schema_uri',
        None,
        'provide a referencing file with the expected table schema, currently '
        'enabled for the formats: AVRO, PARQUET, ORC.',
        flag_values=fv,
    )
    flags.DEFINE_enum(
        'encoding',
        None,
        ['UTF-8', 'ISO-8859-1', 'UTF-16BE', 'UTF-16LE', 'UTF-32BE', 'UTF-32LE'],
        'The character encoding used by the input file. Options include:'
        '\n ISO-8859-1 (also known as Latin-1)'
        '\n UTF-8'
        '\n UTF-16BE (UTF-16 BigEndian)'
        '\n UTF-16LE (UTF-16 LittleEndian)'
        '\n UTF-32BE (UTF-32 BigEndian)'
        '\n UTF-32LE (UTF-32 LittleEndian)',
        short_name='E',
        flag_values=fv,
    )
    flags.DEFINE_enum(
        'file_set_spec_type',
        None,
        ['FILE_SYSTEM_MATCH', 'NEW_LINE_DELIMITED_MANIFEST'],
        'Specifies how to discover files given source URIs. '
        'Options include: '
        '\n FILE_SYSTEM_MATCH: expand source URIs by listing files from the '
        'underlying object store. This is the default behavior.'
        '\n NEW_LINE_DELIMITED_MANIFEST: indicate the source URIs provided are '
        'new line delimited manifest files, where each line contains a URI '
        'with no wild-card.',
        flag_values=fv,
    )

    # Flags defined through the shared frontend_flags helpers. The objects
    # they return are kept on the instance and read via `.value` in
    # RunWithArgs.
    self.null_marker_flag = frontend_flags.define_null_marker(flag_values=fv)
    self.null_markers_flag = frontend_flags.define_null_markers(flag_values=fv)
    self.time_zone_flag = frontend_flags.define_time_zone(flag_values=fv)
    self.date_format_flag = frontend_flags.define_date_format(flag_values=fv)
    self.datetime_format_flag = frontend_flags.define_datetime_format(
        flag_values=fv
    )
    self.time_format_flag = frontend_flags.define_time_format(flag_values=fv)
    self.timestamp_format_flag = frontend_flags.define_timestamp_format(
        flag_values=fv
    )
    self.source_column_match_flag = frontend_flags.define_source_column_match(
        flag_values=fv
    )
    self.parquet_map_target_type_flag = (
        frontend_flags.define_parquet_map_target_type(flag_values=fv)
    )

    # Called last, after every flag above has been defined.
    # NOTE(review): presumably applies rc-file defaults to those flags -
    # confirm against the base class.
    self._ProcessCommandRc(fv)

  def RunWithArgs(
      self, source_uris: str, schema: Optional[str] = None
  ) -> Optional[int]:
    """Emits a definition in JSON for an external table, such as GCS.

    The output of this command can be redirected to a file and used for the
    external_table_definition flag with the "bq query" and "bq mk" commands.
    It produces a definition with the most commonly used values for options.
    You can modify the output to override option values.

    The <source_uris> argument is a comma-separated list of URIs indicating
    the data referenced by this external table.

    The <schema> argument should be either the name of a JSON file or a text
    schema.

    In the case that the schema is provided in text form, it should be a
    comma-separated list of entries of the form name[:type], where type will
    default to string if not specified.

    In the case that <schema> is a filename, it should be a JSON file
    containing a single array, each entry of which should be an object with
    properties 'name', 'type', and (optionally) 'mode'. For more detail:
    https://cloud.google.com/bigquery/docs/schemas#specifying_a_json_schema_file

    Note: the case of a single-entry schema with no type specified is
    ambiguous; one can use name:string to force interpretation as a
    text schema.

    Usage:
    mkdef <source_uris> [<schema>]

    Examples:
      bq mkdef 'gs://bucket/file.csv' field1:integer,field2:string

    Arguments:
      source_uris: Comma-separated list of URIs.
      schema: Either a text schema or JSON file, as above.
    """
    # Flags defined directly in __init__ are read as attributes of self;
    # flags created via frontend_flags are read through the stored objects.
    # NOTE(review): the attribute access presumably relies on the base class
    # proxying flag values - confirm.
    # pylint: disable=line-too-long
    definition = frontend_utils.CreateExternalTableDefinition(
        source_format=self.source_format,
        source_uris=source_uris,
        schema=schema,
        autodetect=self.autodetect,
        connection_id=self.connection_id,
        ignore_unknown_values=self.ignore_unknown_values,
        hive_partitioning_mode=self.hive_partitioning_mode,
        hive_partitioning_source_uri_prefix=self.hive_partitioning_source_uri_prefix,
        require_hive_partition_filter=self.require_hive_partition_filter,
        use_avro_logical_types=self.use_avro_logical_types,
        parquet_enum_as_string=self.parquet_enum_as_string,
        parquet_enable_list_inference=self.parquet_enable_list_inference,
        metadata_cache_mode=self.metadata_cache_mode,
        object_metadata=self.object_metadata,
        preserve_ascii_control_characters=self.preserve_ascii_control_characters,
        reference_file_schema_uri=self.reference_file_schema_uri,
        encoding=self.encoding,
        file_set_spec_type=self.file_set_spec_type,
        null_marker=self.null_marker_flag.value,
        null_markers=self.null_markers_flag.value,
        time_zone=self.time_zone_flag.value,
        date_format=self.date_format_flag.value,
        datetime_format=self.datetime_format_flag.value,
        time_format=self.time_format_flag.value,
        timestamp_format=self.timestamp_format_flag.value,
        source_column_match=self.source_column_match_flag.value,
        parquet_map_target_type=self.parquet_map_target_type_flag.value,
    )
    # pylint: enable=line-too-long

    # Written straight to stdout so the output can be redirected to a file;
    # sorted keys keep the emitted JSON stable between runs.
    json.dump(definition, sys.stdout, sort_keys=True, indent=2)