HEX

File: //snap/google-cloud-cli/394/lib/googlecloudsdk/appengine/tools/context_util.py
# Copyright 2015 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""The implementation of generating a source context file."""

import json
import logging
import os
import re
import subprocess

from googlecloudsdk.appengine._internal import six_subset

# Regex for the git config keys that hold remote URLs. It is passed to
# `git config --get-regexp` and also used to pull the remote name (group 1)
# back out of each returned key.
_REMOTE_URL_PATTERN = r'remote\.(.*)\.url'

# Regex recognizing https URLs of the forms
#   https://<hostname>/id/<repo_id>
#   https://<hostname>/p/<project_id>
#   https://<hostname>/p/<project_id>/r/<repo_name>
# optionally followed by a further path, query string or fragment. The named
# groups are hostname, id_type ('p' or 'id'), project_or_repo_id and the
# optional repo_name.
_CLOUD_REPO_PATTERN = (
    r'^https://'
    '(?P<hostname>[^/]*)/'
    '(?P<id_type>p|id)/'
    '(?P<project_or_repo_id>[^/?#]+)'
    '(/r/(?P<repo_name>[^/?#]+))?'
    '([/#?].*)?')

# Matches the section headers that `git status` prints when the work tree has
# untracked, staged or unstaged changes. Intended for use with re.MULTILINE.
#
# Older git versions prefix every status line with '#'; newer versions only do
# so when status.displayCommentPrefix is enabled. The prefix is therefore
# optional so that both output styles are recognized. Group 1 is the header
# text that matched.
_GIT_PENDING_CHANGE_PATTERN = (
    '^(?:# *)?('
    'Untracked files|'
    'Changes to be committed|'
    'Changes not staged for commit'
    '):')

# Values stored under the 'category' label of an extended source context.
CAPTURE_CATEGORY = 'capture'
REMOTE_REPO_CATEGORY = 'remote_repo'
# Base name of the source context file written into the output directory.
CONTEXT_FILENAME = 'source-context.json'

# Keep this global name to protect against unexpected breakages.
# NOTE(review): nothing in the visible part of this module reads it.
EXT_CONTEXT_FILENAME = 'source-contexts.json'


class _ContextType(object):
  """Integer ranks for the kinds of source context, worst (0) to best (5).

  A larger rank means the context is expected to provide a better user
  experience. Google repositories are ranked above other git hosts because
  they do not require additional authorization to view.
  """

  (
      # No details are known about the context.
      OTHER,
      # A git repository stored on an unfamiliar host.
      GIT_UNKNOWN,
      # An ssh link to a git repository on a known host (Github or BitBucket).
      GIT_KNOWN_HOST_SSH,
      # An http link to a git repository on a known host (Github or
      # BitBucket).
      GIT_KNOWN_HOST,
      # A google cloud repo.
      CLOUD_REPO,
      # User-requested captured snapshot of source code.
      SOURCE_CAPTURE,
  ) = range(6)


# Matches a leading "<protocol>:" prefix made of word characters. URLs that do
# not match are treated as ssh URLs by _GetGitContextTypeFromDomain.
_PROTOCOL_PATTERN = re.compile(r'^(?P<protocol>\w+):')
# For a "<protocol>://..." URL, captures a two-label domain (e.g. github.com)
# as 'domain', skipping any leading "user@" or subdomain part of the host. The
# domain must be followed by '/' or ':'.
_DOMAIN_PATTERN = re.compile(r'^\w+://([^/]*[.@])?(?P<domain>\w+\.\w+)[/:]')


def _GetGitContextTypeFromDomain(url):
  """Returns the context type for the input Git url.

  The url's domain decides the type: google.com is a cloud repo, github.com
  and bitbucket.org are known hosts (ranked lower when reached over ssh), and
  everything else is unknown. Host names and protocol names are compared
  case-insensitively.

  Args:
    url: The remote URL of a git repository. May be None or empty.
  Returns:
    One of the _ContextType values.
  """
  if not url:
    return _ContextType.GIT_UNKNOWN

  protocol_match = _PROTOCOL_PATTERN.match(url)
  if protocol_match:
    protocol = protocol_match.group('protocol')
  else:
    # No explicit protocol (e.g. "git@host:path"). Assume ssh and add the
    # prefix so that the domain pattern can parse the URL.
    protocol = 'ssh'
    url = 'ssh://' + url

  domain_match = _DOMAIN_PATTERN.match(url)
  if not domain_match:
    return _ContextType.GIT_UNKNOWN

  # Host names are case-insensitive, so normalize before comparing.
  domain = domain_match.group('domain').lower()
  if domain == 'google.com':
    return _ContextType.CLOUD_REPO
  if domain in ('github.com', 'bitbucket.org'):
    if protocol.lower() == 'ssh':
      return _ContextType.GIT_KNOWN_HOST_SSH
    return _ContextType.GIT_KNOWN_HOST
  return _ContextType.GIT_UNKNOWN


def _GetContextType(context, labels):
  """Classifies an extended source context as one of the _ContextType ranks.

  A 'capture' category label takes precedence over everything else. Otherwise
  a non-empty 'git' entry is classified by its url, a 'cloudRepo' entry is a
  cloud repo, and anything else is OTHER.

  Args:
    context: A source context dict.
    labels: A dict containing the labels associated with the context.
  Returns:
    The context type.
  """
  if labels.get('category') == CAPTURE_CATEGORY:
    return _ContextType.SOURCE_CAPTURE

  git_context = context.get('git')
  if git_context:
    git_url = git_context.get('url')
    return _GetGitContextTypeFromDomain(git_url)

  return (_ContextType.CLOUD_REPO if 'cloudRepo' in context
          else _ContextType.OTHER)


def _IsRemoteBetter(new_name, old_name):
  """Decides, by remote name only, whether a new remote beats the current one.

  "origin" always wins because it is the standard name for the origin of a
  cloned repo. Between two other names, the one that sorts last
  alphabetically wins; that choice is arbitrary but stable. A missing new
  name never wins, and any new name beats a missing old name.

  Args:
    new_name: The name to be evaluated.
    old_name: The name to compare against.
  Returns:
    True iff new_name should replace old_name.
  """
  # Cases where the incumbent is kept no matter what.
  if old_name == 'origin' or not new_name:
    return False
  # Otherwise the newcomer wins outright when there is no incumbent or when
  # it is "origin"; failing that, fall back to alphabetical order.
  return (not old_name) or new_name == 'origin' or new_name > old_name


class GenerateSourceContextError(Exception):
  """Raised when a source context cannot be created for a directory."""


def IsCaptureContext(context):
  """Tells whether an extended source context is labeled as a capture.

  Args:
    context: An ExtendedSourceContext-compatible dict.
  Returns:
    True iff the 'category' label of the context equals CAPTURE_CATEGORY.
  """
  labels = context.get('labels', {})
  return labels.get('category', None) == CAPTURE_CATEGORY


def ExtendContextDict(context, category=REMOTE_REPO_CATEGORY, remote_name=None):
  """Wraps a source context dict into an ExtendedSourceContext dict.

  The result holds the original context under 'context' and a 'labels' dict
  carrying the category plus, when given, the git remote name.

  Args:
    context: A SourceContext-compatible dict
    category:  string indicating the category of context (either
        CAPTURE_CATEGORY or REMOTE_REPO_CATEGORY)
    remote_name: The name of the remote in git.
  Returns:
    An ExtendedSourceContext-compatible dict.
  """
  extended = {'context': context, 'labels': {'category': category}}
  if remote_name:
    extended['labels']['remote_name'] = remote_name
  return extended


def HasPendingChanges(source_directory):
  """Checks if the git repo in a directory has any pending changes.

  Runs `git status` in the directory and looks for one of the section headers
  described by _GIT_PENDING_CHANGE_PATTERN.

  Args:
    source_directory: The path to directory containing the source code.
  Returns:
    True if there are any uncommitted or untracked changes in the local repo
    for the given directory. False otherwise, including when git could not be
    run or produced no output.
  """
  status = _CallGit(source_directory, 'status')
  if not status:
    # _CallGit returns None when git fails; there is nothing to inspect.
    return False
  pending = re.search(_GIT_PENDING_CHANGE_PATTERN, status,
                      flags=re.MULTILINE)
  return pending is not None


def CalculateExtendedSourceContexts(source_directory):
  """Generate extended source contexts for a directory.

  Reads the git remotes and the HEAD revision of the repository at
  source_directory and turns every remote into an
  ExtendedSourceContext-compatible dictionary. Identical contexts are
  reported only once.

  Args:
    source_directory: The path to directory containing the source code.
  Returns:
    One or more ExtendedSourceContext-compatible dictionaries describing
    the remote repository or repositories associated with the given directory.
  Raises:
    GenerateSourceContextError: if source context could not be generated.
  """
  # Both the remotes and the current revision are required.
  remote_urls = _GetGitRemoteUrls(source_directory)
  if not remote_urls:
    raise GenerateSourceContextError(
        'Could not list remote URLs from source directory: %s' %
        source_directory)

  source_revision = _GetGitHeadRevision(source_directory)
  if not source_revision:
    raise GenerateSourceContextError(
        'Could not find HEAD revision from the source directory: %s' %
        source_directory)

  # Build one context per remote. Contexts are dicts and therefore not
  # hashable, so duplicates are filtered with a linear scan; the number of
  # remotes is expected to be small.
  unique_contexts = []
  for remote_name in remote_urls:
    parsed = _ParseSourceContext(
        remote_name, remote_urls[remote_name], source_revision)
    if not parsed or parsed in unique_contexts:
      continue
    unique_contexts.append(parsed)

  if unique_contexts:
    return unique_contexts

  # Nothing usable came out of the remotes.
  raise GenerateSourceContextError(
      'Could not find any repository in the remote URLs for source '
      'directory: %s' % source_directory)


def BestSourceContext(source_contexts):
  """Returns the "best" source context from a list of contexts.

  "Best" is a heuristic for the context that is most useful to a Google Cloud
  Platform application. In decreasing order of preference:

  1. The capture context, if there is one. (I.e., a context with category
     'capture')
  2. The Cloud Repo context, if there is one.
  3. A repo context from another known provider (i.e. github or bitbucket), if
     there is no Cloud Repo context.
  4. The generic git repo context, if none of the above apply.

  If there are two Cloud Repo contexts and one of them is a "capture" context,
  that context is considered best.

  Contexts of the same kind are compared by remote name: "origin" is the best
  name, followed by the name that comes last alphabetically.

  If that still leaves a tie, the tied context that appears earliest in
  source_contexts wins.

  Args:
    source_contexts: A list of extended source contexts.
  Returns:
    A single source context, or None if source_contexts is empty.
  Raises:
    KeyError if any extended source context is malformed.
  """
  winner = None
  winner_type = None
  winner_remote = None
  for extended in source_contexts:
    candidate = extended['context']
    labels = extended.get('labels', {})
    candidate_type = _GetContextType(candidate, labels)
    candidate_remote = labels.get('remote_name')

    # While winner_type is still None neither rejection below can trigger, so
    # the first context is accepted unconditionally.
    strictly_worse = winner_type and candidate_type < winner_type
    if strictly_worse:
      continue
    if candidate_type == winner_type:
      # Same kind of context: the remote name breaks the tie.
      if not _IsRemoteBetter(candidate_remote, winner_remote):
        continue

    winner = candidate
    winner_type = candidate_type
    winner_remote = candidate_remote
  return winner


def GetSourceContextFilesCreator(output_dir, source_contexts, source_dir=None):
  """Returns a function to create source context files in the given directory.

  The returned creator function will produce one file: source-context.json

  Args:
    output_dir: (String) The directory to create the files (usually the yaml
        directory).
    source_contexts: ([ExtendedSourceContext-compatible json dict])
        A list of json-serializable dicts containing source contexts. If None
        or empty, the contexts are inferred from the Git repo associated with
        source_dir (or output_dir).
    source_dir: (String) The location of the source files, for inferring source
        contexts when source_contexts is empty or None. If not specified,
        output_dir will be used instead.
  Returns:
    callable() - A function that will create source-context.json file in the
    given directory. Calling it returns a cleanup function which can be used
    to delete any files the creator function created.

    If no source contexts are available, the creator function creates nothing
    and the cleanup function it returns does nothing either.
  """
  contexts = source_contexts or _GetSourceContexts(source_dir or output_dir)

  creators = []
  if contexts:
    creators.append(_GetContextFileCreator(output_dir, contexts))

  def Generate():
    # Run every creator and remember the cleanup each one hands back.
    cleanups = []
    for create in creators:
      cleanups.append(create())

    def Cleanup():
      for undo in cleanups:
        undo()
    return Cleanup

  return Generate


def CreateContextFiles(output_dir, source_contexts, overwrite=False,
                       source_dir=None):
  """Creates source context file in the given directory if possible.

  Currently, only source-context.json file will be produced. It contains the
  best context among source_contexts, as chosen by BestSourceContext.

  Args:
    output_dir: (String) The directory to create the files (usually the yaml
        directory).
    source_contexts:  ([ExtendedSourceContext-compatible json dict])
        A list of json-serializable dicts containing source contexts. If None
        or empty, source context will be inferred from source_dir.
    overwrite: (boolean) If true, silently replace any existing file. If
        false, an existing file is left untouched and not reported as created.
    source_dir: (String) The location of the source files, for inferring
        source contexts when source_contexts is empty or None. If not
        specified, output_dir will be used instead.
  Returns:
    ([String]) A list containing the names of the files created. If there are
    no source contexts found, or if the context file could not be created, the
    result will be an empty list.
  """
  if not source_contexts:
    source_contexts = _GetSourceContexts(source_dir or output_dir)
    if not source_contexts:
      return []

  # Pick the context first so that a malformed context is reported (KeyError)
  # regardless of whether the file ends up being written.
  context_object = BestSourceContext(source_contexts)
  context_filename = os.path.join(output_dir, CONTEXT_FILENAME)

  created = []
  try:
    if overwrite or not os.path.exists(context_filename):
      with open(context_filename, 'w') as f:
        json.dump(context_object, f)
      created.append(context_filename)
  except IOError as e:
    # Best effort: failing to write the file is logged, not raised.
    logging.warning('Could not generate [%s]: %s', context_filename, e)

  return created


def _CallGit(cwd, *args):
  """Runs git with the given arguments inside a working directory.

  Args:
    cwd: The working directory for the command.
    *args: Any arguments for the git command.
  Returns:
    The raw output of the command, or None if the command failed. Under
    Python 3 the output is decoded as UTF-8.
  """
  command = ['git']
  command.extend(args)
  try:
    raw_output = subprocess.check_output(command, cwd=cwd)
  except (OSError, subprocess.CalledProcessError) as e:
    # Covers both a failure to launch git and a non-zero exit status.
    logging.debug('Could not call git with args %s: %s', args, e)
    return None
  if six_subset.PY3:
    return raw_output.decode('utf-8')
  return raw_output


def _GetGitRemoteUrlConfigs(source_directory):
  """Asks git for every config entry that holds a remote URL.

  Args:
    source_directory: The path to directory containing the source code.
  Returns:
    The raw output of the command, or None if the command failed.
  """
  git_args = ('config', '--get-regexp', _REMOTE_URL_PATTERN)
  return _CallGit(source_directory, *git_args)


def _GetGitRemoteUrls(source_directory):
  """Finds the list of git remotes for the given source directory.

  Args:
    source_directory: The path to directory containing the source code.
  Returns:
    A dictionary of remote name to remote URL, empty if no remotes are found.
  """
  remote_url_config_output = _GetGitRemoteUrlConfigs(source_directory)
  if not remote_url_config_output:
    return {}

  result = {}
  for config_line in remote_url_config_output.split('\n'):
    if not config_line:
      continue  # Skip blank lines.

    # Each line looks like "remote.<name>.url <url>". Split only at the first
    # space: the URL itself may contain spaces (e.g. a local path).
    remote_url_config_name, separator, remote_url = config_line.partition(' ')
    if not separator:
      logging.debug('Skipping unexpected config line, incorrect segments: %s',
                    config_line)
      continue

    # Recover the name of the remote from the config key.
    remote_url_name_match = re.match(
        _REMOTE_URL_PATTERN, remote_url_config_name)
    if not remote_url_name_match:
      logging.debug('Skipping unexpected config line, could not match '
                    'remote: %s', config_line)
      continue

    result[remote_url_name_match.group(1)] = remote_url
  return result


def _GetGitHeadRevision(source_directory):
  """Looks up the revision that HEAD points to in the given source directory.

  Args:
    source_directory: The path to directory containing the source code.
  Returns:
    The HEAD revision of the current branch, or None if the command failed.
  """
  raw_output = _CallGit(source_directory, 'rev-parse', 'HEAD')
  if not raw_output:
    return None
  return raw_output.strip()


def _ParseSourceContext(remote_name, remote_url, source_revision):
  """Parses the URL into a source context blob, if the URL is a git or GCP repo.

  The URL is first tried as a Cloud Repo URL, which can take three forms:
    1: https://<hostname>/id/<repo_id>
    2: https://<hostname>/p/<project_id>
    3: https://<hostname>/p/<project_id>/r/<repo_name>

  Form 1 identifies the repository directly by <repo_id>. Forms 2 and 3
  identify it by the pair (<project_id>, <repo_name>), where form 2 is form 3
  with <repo_name> defaulting to "default".

  Anything else, including a seemingly malformed URL, is treated as a plain
  Git URL, since the inputs to this function always come from Git.

  Args:
    remote_name: The name of the remote.
    remote_url: The remote URL to parse.
    source_revision: The current revision of the source directory.
  Returns:
    An ExtendedSourceContext suitable for JSON.
  """
  # Stays None unless the URL turns out to describe a Cloud Repo.
  cloud_repo_id = None

  match = re.match(_CLOUD_REPO_PATTERN, remote_url)
  if match:
    id_type = match.group('id_type')
    identifier = match.group('project_or_repo_id')
    repo_name = match.group('repo_name')
    if id_type == 'id':
      # A GCP URL with an ID can't have a repo specification. If it has
      # one, it's either malformed or it's a Git URL from some other service.
      if not repo_name:
        cloud_repo_id = {'uid': identifier}
    elif id_type == 'p':
      # A project name plus an optional repo name.
      cloud_repo_id = {
          'projectRepoId': {
              'projectId': identifier,
              'repoName': repo_name or 'default'}}

  if cloud_repo_id:
    context = {
        'cloudRepo': {
            'repoId': cloud_repo_id,
            'revisionId': source_revision}}
  else:
    context = {'git': {'url': remote_url, 'revisionId': source_revision}}

  return ExtendContextDict(context, remote_name=remote_name)


def _GetJsonFileCreator(name, json_object):
  """Creates a creator function for an extended source context file.

  Args:
    name: (String) The name of the file to generate.
    json_object: Any object compatible with json.dump.
  Returns:
    (callable()) A creator function that will create the file and return a
    cleanup function that will delete the file. If the file already exists at
    the time of this call, the creator and its cleanup both do nothing. If
    writing the file fails, the failure is logged and the cleanup is still
    safe to call.
  """
  if os.path.exists(name):
    logging.warning('%s already exists. It will not be updated.', name)
    return lambda: (lambda: None)

  def Cleanup():
    # Generate() may have failed before the file came into existence, and the
    # cleanup may be invoked more than once; a missing file is not an error.
    if os.path.exists(name):
      os.remove(name)

  def Generate():
    try:
      with open(name, 'w') as f:
        json.dump(json_object, f)
    except IOError as e:
      # Best effort: failing to write the file is logged, not raised.
      logging.warning('Could not generate [%s]: %s', name, e)
    return Cleanup

  return Generate


def _GetContextFileCreator(output_dir, contexts):
  """Builds the creator function for the source-context.json file.

  The file content is the best context among the given contexts, as chosen by
  BestSourceContext.

  Args:
    output_dir: (String) The name of the directory in which to generate the
        file. The file will be named source-context.json.
    contexts: ([dict]) A list of ExtendedSourceContext-compatible dicts for json
        serialization.
  Returns:
    A creator function that will create the file.
  """
  target_path = os.path.join(output_dir, CONTEXT_FILENAME)
  best_context = BestSourceContext(contexts)
  return _GetJsonFileCreator(target_path, best_context)


def _GetSourceContexts(source_dir):
  """Gets the source contexts associated with a directory.

  Thin wrapper around CalculateExtendedSourceContexts that turns a
  GenerateSourceContextError into an empty result and logs a message when no
  context could be determined.

  Args:
    source_dir: (String) The directory to inspect.
  Returns:
    [ExtendedSourceContext-compatible json dict] A list of 0 or more source
    contexts.
  """
  source_contexts = []
  try:
    source_contexts = CalculateExtendedSourceContexts(source_dir)
  except GenerateSourceContextError:
    # No valid source contexts; keep the empty list.
    pass

  if source_contexts:
    return source_contexts

  logging.info(
      'Could not find any remote repositories associated with [%s]. '
      'Cloud diagnostic tools may not be able to display the correct '
      'source code for this deployment.', source_dir)
  return source_contexts