HEX
Server: Apache/2.4.65 (Ubuntu)
System: Linux ielts-store-v2 6.8.0-1036-gcp #38~22.04.1-Ubuntu SMP Thu Aug 14 01:19:18 UTC 2025 x86_64
User: root (0)
PHP: 7.2.34-54+ubuntu20.04.1+deb.sury.org+1
Disabled: pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wifcontinued,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_get_handler,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority,pcntl_async_signals,
Upload Files
File: //snap/google-cloud-cli/394/lib/googlecloudsdk/command_lib/storage/expansion.py
# -*- coding: utf-8 -*- #
# Copyright 2016 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities for expanding wildcarded GCS pathnames."""

from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

import abc
import fnmatch
import os
import re

from googlecloudsdk.api_lib.storage import storage_api
from googlecloudsdk.api_lib.storage import storage_util
from googlecloudsdk.core import log
from googlecloudsdk.core import properties

import six


class PathExpander(six.with_metaclass(abc.ABCMeta)):
  """Abstract base class for path wildcard expansion.

  Subclasses supply the filesystem primitives (AbsPath, Exists, IsDir,
  ListDir, Join); this base class implements glob-style wildcard expansion
  on top of them. Directories are distinguished from files by a trailing
  separator appended in _FormatPath().
  """

  # A regex character class matching the glob metacharacters '*', '?', '['.
  EXPANSION_CHARS = '[*?[]'

  @classmethod
  def ForPath(cls, path):
    """Returns the expander implementation that can handle the given path.

    Args:
      path: str, A local filesystem path or a gs:// URL.

    Returns:
      GCSPathExpander for gs:// URLs, LocalPathExpander otherwise.
    """
    if path.startswith('gs://'):
      return GCSPathExpander()
    return LocalPathExpander()

  def __init__(self, sep):
    # sep: str, The path separator character for this filesystem
    # ('/' for GCS, os.sep for local paths).
    self._sep = sep

  @abc.abstractmethod
  def AbsPath(self, path):
    """Converts the given path to an absolute path for this filesystem."""
    pass

  @abc.abstractmethod
  def IsFile(self, path):
    """Returns True if path exists and is a file."""
    pass

  @abc.abstractmethod
  def IsDir(self, path):
    """Returns True if path exists and is a directory."""
    pass

  @abc.abstractmethod
  def Exists(self, path):
    """Returns True if path exists."""
    pass

  @abc.abstractmethod
  def ListDir(self, path):
    """Returns the names of the entries directly under directory path."""
    pass

  @abc.abstractmethod
  def Join(self, path1, path2):
    """Joins two path components with this filesystem's separator."""
    pass

  @classmethod
  def HasExpansion(cls, path):
    """Returns True if path contains any wildcard characters."""
    return bool(re.search(PathExpander.EXPANSION_CHARS, path))

  def ExpandPath(self, path):
    """Expand the given path that contains wildcard characters.

    Args:
      path: str, The path to expand.

    Returns:
      ({str}, {str}), A tuple of the sets of files and directories that match
      the wildcard path. All returned paths are absolute.
    """
    files = set()
    dirs = set()
    for p in self._Glob(self.AbsPath(path)):
      # _FormatPath() marks directories with a trailing separator; that is
      # what partitions matches into the two result sets here.
      if p.endswith(self._sep):
        dirs.add(p)
      else:
        files.add(p)
    if self.IsEndRecursive(path):
      # If the path has /** on the end, it is going to match all files under
      # each matching root, so there is no need to process any sub-directories
      # explicitly.
      dirs.clear()
    return (files, dirs)

  def ExpandPaths(self, paths):
    """Expands each of the given wildcard paths and merges the results.

    Logs a warning for any path that matches nothing.

    Args:
      paths: iterable of str, The paths to expand.

    Returns:
      ({str}, {str}), The union of the files and directories matched by all
      of the given paths. All returned paths are absolute.
    """
    files = set()
    dirs = set()
    for p in paths:
      (current_files, current_dirs) = self.ExpandPath(p)
      if not current_files and not current_dirs:
        log.warning('[{}] does not match any paths.'.format(p))
        continue
      files.update(current_files)
      dirs.update(current_dirs)
    return files, dirs

  def IsEndRecursive(self, path):
    """Returns True if path ends in a recursive wildcard component ('/**')."""
    return path.endswith(self._sep + '**')

  def IsDirLike(self, path):
    """Returns True if path ends with the separator (names a directory)."""
    return path.endswith(self._sep)

  def _Glob(self, path):
    """Recursively yields all paths matching the given wildcard path.

    Splits off the last path component, recursively expands the parent, then
    matches the last component against each expanded parent directory.

    Args:
      path: str, An absolute path that may contain wildcard characters.

    Yields:
      str, Each matching path, with a trailing separator for directories.
    """
    if not self.HasExpansion(path):
      # A literal path matches iff it exists.
      if self.Exists(path):
        yield self._FormatPath(path)
      return

    # NOTE: os.path.split also treats '/' as a separator on Windows, so this
    # split works for gs:// paths as well as local ones.
    dir_path, basename = os.path.split(path)
    has_basename_expansion = self.HasExpansion(basename)
    for expanded_dir_path in self._Glob(dir_path):
      if not has_basename_expansion:
        # Only the parent had wildcards; just check the literal child.
        path = self.Join(expanded_dir_path, basename)
        if self.Exists(path):
          yield self._FormatPath(path)
      else:
        if basename == '**':
          # Recursive wildcard: everything under this directory matches.
          for n in self._RecursiveDirList(expanded_dir_path):
            yield self._FormatPath(n)
        else:
          # Normal glob component: filter the directory listing against it.
          for n in fnmatch.filter(
              self.ListDir(expanded_dir_path),
              basename):
            yield self._FormatPath(self.Join(expanded_dir_path, n))

  def _RecursiveDirList(self, dir_path):
    """Yields every path (files and directories) under dir_path, depth-first.

    Recurses into every entry unconditionally; both implementations'
    ListDir() return no entries for a non-directory, so that is harmless.
    """
    for n in self.ListDir(dir_path):
      path = self.Join(dir_path, n)
      yield path
      for x in self._RecursiveDirList(path):
        yield x

  def _FormatPath(self, path):
    """Returns path with a trailing separator appended if it is a directory."""
    if self.IsDir(path) and not path.endswith(self._sep):
      path = path + self._sep
    return path


class LocalPathExpander(PathExpander):
  """Path wildcard expansion over the local filesystem.

  Every primitive simply delegates to the corresponding os / os.path call.
  """

  def __init__(self):
    """Initializes the expander with the platform separator (os.sep)."""
    super(LocalPathExpander, self).__init__(os.sep)

  def AbsPath(self, path):
    """Returns the absolute version of the given path."""
    return os.path.abspath(path)

  def IsFile(self, path):
    """Returns True if path refers to an existing regular file."""
    return os.path.isfile(path)

  def IsDir(self, path):
    """Returns True if path refers to an existing directory."""
    return os.path.isdir(path)

  def Exists(self, path):
    """Returns True if path exists on disk."""
    return os.path.exists(path)

  def ListDir(self, path):
    """Returns the entries in directory path, or [] if it is unreadable."""
    try:
      entries = os.listdir(path)
    except OSError:  # Missing or unreadable directories list as empty.
      entries = []
    return entries

  def Join(self, path1, path2):
    """Joins the two path fragments with the platform separator."""
    return os.path.join(path1, path2)


class GCSPathExpander(PathExpander):
  """Implements path expansion for gs:// formatted resource strings.

  Lazily caches the complete object listing of every bucket it touches, so
  existence checks and directory listings after the first are answered from
  memory rather than by additional API calls.
  """

  def __init__(self):
    super(GCSPathExpander, self).__init__('/')
    # Client used for the ListBuckets and ListBucket API calls.
    self._client = storage_api.StorageClient()
    # Maps bucket name -> set of object names in that bucket (lazy cache).
    self._objects = {}
    # Maps full 'gs://bucket/name' path -> storage.Object message.
    self._object_details = {}

  def GetSortedObjectDetails(self, object_paths):
    """Gets all the details for the given paths and returns them sorted.

    Args:
      object_paths: [str], A list of gs:// object or directory paths.

    Returns:
      [{path, data}], A list of dicts with the keys path and data. Path is the
      gs:// path to the object or directory. Object paths will not end in a '/'
      and directory paths will. The data is either a storage.Object message (for
      objects) or a storage_util.ObjectReference for directories. The sort
      order is alphabetical with all directories first and then all objects.
    """
    all_data = []
    for path in object_paths:
      is_obj, data = self._GetObjectDetails(path)
      # Mark directories with a trailing '/' in the returned path.
      path = path if is_obj else path + '/'
      all_data.append((is_obj, {'path': path, 'data': data}))

    # False sorts before True, so directories come first; ties within each
    # group are broken alphabetically by path.
    all_data = sorted(all_data, key=lambda o: (o[0], o[1]['path']))
    return [d[1] for d in all_data]

  def _GetObjectDetails(self, object_path):
    """Gets the actual object data for a given GCS path.

    Args:
      object_path: str, The gs:// path to an object or directory.

    Returns:
      (bool, data), Where element 0 is True if the path is an object, False if
      a directory and where data is either a storage.Object message (for
      objects) or a storage_util.ObjectReference for directories.
    """
    # The details cache is populated by _LoadObjectsIfMissing(), so presence
    # there means the path names a real object.
    details = self._object_details.get(object_path)
    if details:
      return True, details
    else:
      # This isn't an object, must be a "directory" so just return the name
      # data.
      return False, storage_util.ObjectReference.FromUrl(
          object_path, allow_empty_object=True)

  def AbsPath(self, path):
    """Validates path is absolute; gs:// paths are inherently absolute.

    Raises:
      ValueError: If path does not start with gs://.
    """
    if not path.startswith('gs://'):
      raise ValueError('GCS paths must be absolute (starting with gs://)')
    return path

  def IsFile(self, path):
    """Returns True if path names an existing object (not a directory)."""
    exists, is_dir = self._Exists(path)
    return exists and not is_dir

  def IsDir(self, path):
    """Returns True if path names the root, a bucket, or an object prefix."""
    exists, is_dir = self._Exists(path)
    return exists and is_dir

  def Exists(self, path):
    """Returns True if path names the root, a bucket, an object or a prefix."""
    exists, _ = self._Exists(path)
    return exists

  def _Exists(self, path):
    """Checks existence against the cached bucket listing.

    Args:
      path: str, A gs:// path.

    Returns:
      (bool, bool), (exists, is_directory). Directories are synthetic: the
      root, an existing bucket, or any '/'-delimited prefix of an existing
      object name counts as an existing directory.
    """
    if self._IsRoot(path):
      # Root of the filesystem always exists
      return True, True

    path = path.rstrip('/')
    obj_ref = storage_util.ObjectReference.FromUrl(
        path, allow_empty_object=True)
    self._LoadObjectsIfMissing(obj_ref.bucket_ref)

    # A bucket absent from the cache failed to list (e.g. does not exist).
    if obj_ref.bucket in self._objects:
      if not obj_ref.name:
        # Just a bucket, and it exists.
        return True, True
      if obj_ref.name in self._objects[obj_ref.bucket]:
        # This is an object and it exists.
        return True, False
      # See if this is a directory prefix of an existing object.
      dir_name = self._GetDirString(obj_ref.name)
      for i in self._objects[obj_ref.bucket]:
        if i.startswith(dir_name):
          return True, True

    return False, False

  def ListDir(self, path):
    """Yields the names of the entries directly under the given directory.

    At the root these are the bucket names in the current project; within a
    bucket they are the distinct first path components of the object names
    sharing the directory's prefix.

    NOTE(review): self._objects[obj_ref.bucket] raises KeyError when the
    bucket was never successfully listed (e.g. it does not exist); callers
    appear to reach here only after Exists() succeeds -- confirm.

    Args:
      path: str, A gs:// directory path.

    Yields:
      str, Each child entry name (not a full path).
    """
    if self._IsRoot(path):
      # The contents of the root filesystem are the buckets in the current
      # project.
      for b in self._client.ListBuckets(
          project=properties.VALUES.core.project.Get(required=True)):
        yield b.name
      return

    obj_ref = storage_util.ObjectReference.FromUrl(
        path, allow_empty_object=True)
    self._LoadObjectsIfMissing(obj_ref.bucket_ref)

    dir_name = self._GetDirString(obj_ref.name)
    parent_dir_length = len(dir_name)

    seen = set()
    for obj_name in self._objects[obj_ref.bucket]:
      if obj_name.startswith(dir_name):
        # Keep only the first path component after the prefix, de-duplicated
        # so each child "directory" is yielded once.
        suffix = obj_name[parent_dir_length:]
        result = suffix.split(self._sep)[0]
        if result not in seen:
          seen.add(result)
          yield result

  def Join(self, path1, path2):
    """Joins path2 onto path1 with exactly one '/' between them."""
    if self._IsRoot(path1):
      return 'gs://' + path2.lstrip(self._sep)
    return path1.rstrip(self._sep) + self._sep + path2.lstrip(self._sep)

  def _IsRoot(self, path):
    """Returns True if path is the root of the gs:// pseudo-filesystem."""
    return path == 'gs://' or path == 'gs:'

  def _LoadObjectsIfMissing(self, bucket_ref):
    """Populates the object caches for bucket_ref on first use.

    A bucket that fails to list (BucketNotFoundError) is deliberately left
    out of self._objects so a later call can retry and so existence checks
    report it missing.

    Args:
      bucket_ref: A bucket reference accepted by StorageClient.ListBucket.
    """
    objects = self._objects.get(bucket_ref.bucket)
    if objects is None:
      try:
        objects = self._client.ListBucket(bucket_ref)
        object_names = set()
        for o in objects:
          full_path = 'gs://' + self.Join(bucket_ref.bucket, o.name)
          self._object_details[full_path] = o
          object_names.add(o.name)
        # Only try to set the result after we start iterating because the API
        # call is not actually made until you try to consume the results. If
        # an API error occurs (like the bucket doesn't exist) we don't want
        # to accidentally cache that it was found.
        self._objects.setdefault(bucket_ref.bucket, set()).update(object_names)
      except storage_api.BucketNotFoundError:
        pass

  def _GetDirString(self, path):
    """Returns path with a trailing '/' appended (empty path is unchanged)."""
    if path and not path.endswith(self._sep):
      return path + self._sep
    return path

  def _FormatPath(self, path):
    """Appends '/' to directories, fixing up the degenerate root spelling."""
    path = super(GCSPathExpander, self)._FormatPath(path)
    # The root can arrive here spelled 'gs:', and the base class then appends
    # a single separator producing 'gs:/'; restore the canonical 'gs://'.
    return 'gs://' if path == 'gs:/' else path