HEX

File: //snap/google-cloud-cli/current/lib/googlecloudsdk/command_lib/util/glob.py
# -*- coding: utf-8 -*- #
# Copyright 2017 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Library for ignoring files for upload.

This library very closely mimics the semantics of Git's gitignore file:
https://git-scm.com/docs/gitignore

See `gcloud topic gcloudignore` for details.

A typical use would be:

  file_chooser = gcloudignore.GetFileChooserForDir(upload_directory)
  for f in file_chooser.GetIncludedFiles('some/path'):
    print 'uploading {}'.format(f)
    # actually do the upload, too
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

import fnmatch
import os
import re

_GCLOUDIGNORE_PATH_SEP = '/'
_ENDS_IN_ODD_NUMBER_SLASHES_RE = r'(?<!\\)\\(\\\\)*$'


class InternalParserError(Exception):
  """An internal error in gcloudignore parsing."""


class InvalidLineError(InternalParserError):
  """Error indicating that a line of the ignore file was invalid."""


def _HandleSpaces(line):
  """Handles spaces in a line.

  In particular, deals with trailing spaces (which are stripped unless
  escaped) and escaped spaces throughout the line (which are unescaped).

  Args:
    line: str, the line

  Returns:
    str, the line with spaces handled
  """
  def _Rstrip(line):
    """Strips unescaped trailing spaces."""
    # First, make the string into "tokens": a backslash followed by any
    # character is one token; any other character is a token on its own.
    tokens = []
    i = 0
    while i < len(line):
      curr = line[i]
      if curr == '\\':
        if i + 1 >= len(line):
          tokens.append(curr)
          break  # Pass through trailing backslash
        tokens.append(curr + line[i+1])
        i += 2
      else:
        tokens.append(curr)
        i += 1

    # Then, strip the trailing space tokens.
    res = []
    only_seen_spaces = True
    for curr in reversed(tokens):
      if only_seen_spaces and curr == ' ':
        continue
      only_seen_spaces = False
      res.append(curr)

    return ''.join(reversed(res))

  def _UnescapeSpaces(line):
    """Unescapes all spaces in a line."""
    return line.replace('\\ ', ' ')

  return _UnescapeSpaces(_Rstrip(line))


def _Unescape(line):
  r"""Unescapes a line.

  The escape character is '\'. An escaped backslash turns into one backslash;
  any other escaped character is ignored.

  Args:
    line: str, the line to unescape

  Returns:
    str, the unescaped line

  """
  return re.sub(r'\\([^\\])', r'\1', line).replace('\\\\', '\\')


def GetPathPrefixes(path):
  """Returns all prefixes for the given path, inclusive.

  That is, for 'foo/bar/baz', returns ['', 'foo', 'foo/bar', 'foo/bar/baz'].

  Args:
    path: str, the path for which to get prefixes.

  Returns:
    list of str, the prefixes.
  """
  path_prefixes = [path]
  path_reminder = True
  # Apparently which one is empty when operating on top-level directory depends
  # on your configuration.
  while path and path_reminder:
    path, path_reminder = os.path.split(path)
    path_prefixes.insert(0, path)
  return path_prefixes


class Glob(object):
  """A file-matching glob pattern.

  See https://git-scm.com/docs/gitignore for full syntax specification.

  Attributes:
    pattern: str, a globbing pattern.
    must_be_dir: bool, true if only dirs match.
  """

  def __init__(self, pattern, must_be_dir=False):
    self.pattern = pattern
    self.must_be_dir = must_be_dir

  def _MatchesHelper(self, pattern_parts, path):
    """Determines whether the given pattern matches the given path.

    Args:
      pattern_parts: list of str, the list of pattern parts that must all match
        the path.
      path: str, the path to match.

    Returns:
      bool, whether the patch matches the pattern_parts (Matches() will convert
        this into a Match value).
    """
    # This method works right-to-left. It checks that the right-most pattern
    # part matches the right-most path part, and that the remaining pattern
    # matches the remaining path.
    if not pattern_parts:
      # We've reached the end of the pattern! Success.
      return True
    if path is None:
      # Distinguish between '*' and '/*'. The former should match '' (the root
      # directory) but the latter should not.
      return False

    pattern_part = pattern_parts[-1]
    remaining_pattern = pattern_parts[:-1]
    if path:  # normpath turns '' into '.', which confuses fnmatch later
      path = os.path.normpath(path)
    remaining_path, path_part = os.path.split(path)
    if not path_part:
      # See note above.
      remaining_path = None

    if pattern_part == '**':
      # If the path would match the remaining pattern_parts after skipping 0-all
      # of the trailing path parts, the whole pattern matches.
      #
      # That is, if we have `<pattern>/**` as a pattern and `foo/bar` as our
      # path, if any of ``, `foo`, and `foo/bar` match `<pattern>`, we return
      # true.
      path_prefixes = GetPathPrefixes(path)
      # '**' patterns only match against the full path (essentially, they have
      # an implicit '/' at the front of the pattern). An empty string at the
      # beginning of remaining_pattern simulates this.
      #
      # pylint: disable=g-explicit-bool-comparison
      # In this case, it's much clearer to show what we're checking for.
      if not (remaining_pattern and remaining_pattern[0] == ''):
        remaining_pattern.insert(0, '')
      # pylint: enable=g-explicit-bool-comparison
      return any(self._MatchesHelper(remaining_pattern, prefix) for prefix
                 in path_prefixes)

    if pattern_part == '*' and not remaining_pattern:
      # We need to ensure that a '*' at the beginning of a pattern does not
      # match a part with a '/' in it. That should only happen when '**' is
      # used.
      # For example: '*/bar' should match 'foo/bar', but not 'foo/qux/bar'.
      if remaining_path and len(remaining_path) > 1:
        return False

    if not fnmatch.fnmatch(path_part, pattern_part):
      # If the current pattern part doesn't match the current path part, the
      # whole pattern can't match the whole path. Give up!
      return False

    return self._MatchesHelper(remaining_pattern, remaining_path)

  def Matches(self, path, is_dir=False):
    """Returns a Match for this pattern and the given path."""
    if self.must_be_dir and not is_dir:
      return False
    if self._MatchesHelper(self.pattern.split('/'), path):
      return True
    else:
      return False

  @classmethod
  def FromString(cls, line):
    """Creates a pattern for an individual line of an ignore file.

    Windows-style newlines must be removed.

    Args:
      line: str, The line to parse.

    Returns:
      Pattern.

    Raises:
      InvalidLineError: if the line was invalid (comment, blank, contains
        invalid consecutive stars).
    """
    if line.endswith('/'):
      line = line[:-1]
      must_be_dir = True
    else:
      must_be_dir = False
    line = _HandleSpaces(line)
    if re.search(_ENDS_IN_ODD_NUMBER_SLASHES_RE, line):
      raise InvalidLineError(
          'Line [{}] ends in an odd number of [\\]s.'.format(line))
    line = _Unescape(line)
    if not line:
      raise InvalidLineError('Line [{}] is blank.'.format(line))
    return cls(line, must_be_dir=must_be_dir)