HEX

File: //snap/google-cloud-cli/394/lib/googlecloudsdk/command_lib/spanner/split_file_parser.py
# -*- coding: utf-8 -*- #
# Copyright 2024 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Provides split file preprocessing for adding splits to a database."""

from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

import csv
import io
import re

from apitools.base.py import extra_types
from googlecloudsdk.api_lib.util import apis
from googlecloudsdk.calliope import exceptions as c_exceptions
from googlecloudsdk.core.util import files


class SplitFileParser:
  r"""Parses a split file into a list of split points.

  The split file is expected to contain one split point per line, in the
  format:
  <ObjectType>[space]<ObjectName>[space](<Split Value>)
  <ObjectType>[space]<ObjectName>[space](<Split Value>)
  ...
  where ObjectType can be TABLE or INDEX.
  Split value is expected to be a comma separated list of key parts.
    Split values should be surrounded by parentheses like ()
    String values should be supplied in single quotes: 'splitKeyPart'
    Boolean values should be one of: true/false (or TRUE/FALSE)
    INT64 and NUMERIC spanner datatype values should be supplied within
    single quotes, like strings: '123', '999999999999999999999999999.99'
    Other number values should be supplied without quotes: 1.287
    Timestamp values should be provided in the following format, in single
    quotes: '2020-06-18T17:24:53Z'
    NULL key parts should be supplied as the bare token NULL.
    If a key part needs to contain a comma, the comma must be escaped with a
    backslash, even inside single quotes.

    Examples:
    TABLE Singers ('c32ca57a-786c-2268-09d4-95182a9930be')
    INDEX Order (4.2)
    TABLE TableD  (0,'7ef9db22-d0e5-6041-8937-4bc6a7ef9db2')
    INDEX IndexXYZ ('8762203435012030000',NULL,NULL)
    INDEX IndexABC  (0, '2020-06-18T17:24:53Z') TableKey (123,'ab\,c')
    -- note that the above split value has a delimiter (comma) in it,
        hence it is escaped by a backslash.
  """

  def __init__(self, splits_file, split_expiration_date):
    """Stores the inputs and compiles the patterns used for parsing.

    Args:
      splits_file: Path of the split file to read.
      split_expiration_date: Expiration time copied onto every split point;
        nothing is set when this value is falsy.
    """
    self.splits_file = splits_file
    self.split_expiration_date = split_expiration_date
    # <ObjectType> <ObjectName> <rest of the line>.
    self.split_line_pattern = re.compile(r'(\S+)\s+(\S+)\s+(.+)')
    # A split value followed by the beginning of another TABLE/INDEX split
    # point on the same line, which is not allowed.
    self.incorrect_split_with_table_key_pattern = re.compile(
        r'\((.*?)\) TABLE (\S+)\s+\((.*?)\)$'
    )
    self.incorrect_split_with_index_key_pattern = re.compile(
        r'\((.*?)\) INDEX (\S+)\s+\((.*?)\)$'
    )
    # "(<index key parts>) TableKey (<table key parts>)".
    self.index_full_key_pattern = re.compile(r'\((.*?)\) TableKey \((.*?)\)$')
    # "(<key parts>)".
    self.single_key_pattern = re.compile(r'\((.*?)\)$')

  def _InvalidSplitPointError(self, split_string):
    """Returns the exception describing a malformed split point line."""
    return c_exceptions.InvalidArgumentException(
        '--splits-file',
        'Invalid split point string: {}. Each split point must be in the'
        ' format of <ObjectType> <ObjectName> (<Split Value>) where'
        ' ObjectType can be TABLE or INDEX'.format(split_string),
    )

  def Process(self):
    """Gets the split points from the input file.

    Returns:
      A list of SplitPoints messages, one per line of the split file.

    Raises:
      InvalidArgumentException: If a line is not a well-formed split point.
    """
    msgs = apis.GetMessagesModule('spanner', 'v1')
    split_points_list = []
    with files.FileReader(self.splits_file) as file:
      for single_split_string in file.read().splitlines():
        # Raises on lines that do not have the three expected parts, so every
        # entry of the returned dictionary is a non-empty string here.
        single_split = self.ParseSplitPointString(single_split_string)
        object_type = single_split['ObjectType'].upper()
        if object_type not in ('TABLE', 'INDEX'):
          raise self._InvalidSplitPointError(single_split_string)

        split = msgs.SplitPoints()
        if object_type == 'TABLE':
          split.table = single_split['ObjectName']
        else:
          split.index = single_split['ObjectName']
        split.keys = self.ParseSplitValue(single_split['SplitValue'])
        if self.split_expiration_date:
          split.expireTime = self.split_expiration_date
        split_points_list.append(split)
    return split_points_list

  def ParseSplitPointString(self, input_string):
    """Parses a string in the format "<ObjectType> <ObjectName> (<Split Value>)".

    Args:
      input_string: The string to parse.

    Returns:
      A dictionary with keys "ObjectType", "ObjectName", and "SplitValue".
      "SplitValue" holds everything after the object name, unparsed.

    Raises:
      InvalidArgumentException: If the input string is not in the expected
        format.
    """
    # Two whitespace-free words followed by the remainder of the line.
    match = self.split_line_pattern.match(input_string)
    if not match:
      raise self._InvalidSplitPointError(input_string)
    return {
        'ObjectType': match.group(1),
        'ObjectName': match.group(2),
        'SplitValue': match.group(3),
    }

  def ParseSplitValue(self, input_string):
    """Parses "(KeyParts)" or "(KeyParts) TableKey (KeyParts)" into keys.

    Args:
      input_string: The string to parse.

    Returns:
      A list of Key messages: one for a plain split value, or two (the index
      key followed by the table key) when a TableKey clause is present.

    Raises:
      InvalidArgumentException: If the value holds more than one split point,
        is not surrounded by parentheses, has no key parts, or contains an
        unquoted key part that is not NULL, a boolean or a number.
    """
    msgs = apis.GetMessagesModule('spanner', 'v1')
    input_string = input_string.strip()
    # Catches the case when single line contains multiple split points.
    if self.incorrect_split_with_table_key_pattern.match(
        input_string
    ) or self.incorrect_split_with_index_key_pattern.match(input_string):
      raise c_exceptions.InvalidArgumentException(
          '--splits-file',
          'Invalid split point string: {}. Each line must contain a single'
          ' split point for a table or index.'.format(input_string),
      )

    match = self.index_full_key_pattern.match(input_string)
    if match:
      # Index split with full key
      all_keys_strings = [match.group(1), match.group(2)]
    else:
      match = self.single_key_pattern.match(input_string)
      if not match:
        raise c_exceptions.InvalidArgumentException(
            '--splits-file',
            'The split value must be surrounded by parenthesis.',
        )
      all_keys_strings = [match.group(1)]

    keys_all = []
    for input_string_per_key in all_keys_strings:
      input_string_per_key = input_string_per_key.strip().strip('()')
      split_tokens = self.TokenizeWithCsv(input_string_per_key)
      if not split_tokens:
        # "()" used to escape as a bare StopIteration from the tokenizer.
        raise c_exceptions.InvalidArgumentException(
            '--splits-file',
            'Invalid split point string: {}. The split value must contain at'
            ' least one key part.'.format(input_string),
        )
      single_key = msgs.Key()
      for split_token in split_tokens:
        key_parts = extra_types.JsonValue()
        if split_token == 'NULL':
          key_parts.is_null = True
        elif split_token in ('true', 'false', 'TRUE', 'FALSE'):
          # bool() of any non-empty string is True, so compare the text.
          key_parts.boolean_value = split_token.lower() == 'true'
        elif "'" not in split_token:
          # Anything unquoted that is not NULL or a boolean must be a number.
          try:
            key_parts.double_value = float(split_token)
          except ValueError:
            raise c_exceptions.InvalidArgumentException(
                '--splits-file',
                'Invalid split point string: {}. Key part [{}] is not NULL,'
                ' true/false or a number; string values must be supplied in'
                ' single quotes.'.format(input_string, split_token),
            )
        else:
          key_parts.string_value = split_token.strip("'")
        single_key.keyParts.append(key_parts)
      keys_all.append(single_key)
    return keys_all

  def TokenizeWithCsv(self, text):
    r"""Tokenizes text using commas as delimiters.

    Quote processing is disabled (csv.QUOTE_NONE), so single quotes stay part
    of the returned tokens and do not protect commas. A comma that belongs to
    a token must be escaped with a backslash: 'ab\,c'. Spaces directly after a
    delimiter are skipped.

    Args:
      text: The text to tokenize.

    Returns:
      A list of tokens; an empty list if text is empty.
    """
    reader = csv.reader(
        io.StringIO(text),
        quotechar="'",
        skipinitialspace=True,
        quoting=csv.QUOTE_NONE,
        escapechar='\\',
    )
    # An empty input yields no rows at all; report that as "no tokens".
    return next(reader, [])


def ParseSplitPoints(args):
  """Reads the split points described by the parsed command arguments.

  Args:
    args: Parsed arguments providing `splits_file` and
      `split_expiration_date`.

  Returns:
    The result of SplitFileParser.Process() for those arguments.
  """
  parser = SplitFileParser(args.splits_file, args.split_expiration_date)
  return parser.Process()