File: //snap/google-cloud-cli/current/lib/surface/storage/hash.py
# -*- coding: utf-8 -*- #
# Copyright 2021 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementation of hash command for getting formatted file hashes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
import base64
import binascii
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.storage import encryption_util
from googlecloudsdk.command_lib.storage import errors
from googlecloudsdk.command_lib.storage import fast_crc32c_util
from googlecloudsdk.command_lib.storage import flags
from googlecloudsdk.command_lib.storage import hash_util
from googlecloudsdk.command_lib.storage import storage_url
from googlecloudsdk.command_lib.storage import wildcard_iterator
from googlecloudsdk.command_lib.storage.resources import resource_reference
from googlecloudsdk.command_lib.util import crc32c
from googlecloudsdk.core import log
_DIGEST_FORMAT_KEY = 'digest_format'
_CRC32C_HASH_KEY = 'crc32c_hash'
_MD5_HASH_KEY = 'md5_hash'
_URL_KEY = 'url'
def _convert_base64_to_hex(base64_string):
  """Converts base64 hash digest to hex-formatted hash digest string."""
  if base64_string is None:
    return None
  return binascii.hexlify(
      base64.b64decode(
          base64_string.strip('\n"\'').encode('utf-8'))).decode('utf-8')
def _is_object_or_file_resource(resource):
  """Returns True if resource is an ObjectResource or FileObjectResource."""
  accepted_resource_types = (
      resource_reference.ObjectResource,
      resource_reference.FileObjectResource,
  )
  return isinstance(resource, accepted_resource_types)
def _get_resource_iterator(url_strings):
  """Yields the object and file resources matched by the given URL strings.

  Every URL string is wildcard-expanded. Object and file matches are yielded
  as they are found. A match that is a cloud bucket is expanded once more with
  a trailing '*', and only the object resources of that expansion are yielded.
  A warning is logged for each URL string that produced no resource.

  Args:
    url_strings (Iterable[str]): URL strings to expand.

  Yields:
    resource_reference.ObjectResource or resource_reference.FileObjectResource
    instances.

  Raises:
    errors.InvalidUrlError: None of the URL strings produced a resource. This
      is raised only after every URL string has been processed.
  """
  matched_any_url = False
  for url_string in url_strings:
    top_level_matches = wildcard_iterator.get_wildcard_iterator(
        url_string,
        error_on_missing_key=False,
        fetch_encrypted_object_hashes=True,
    )
    matched_this_url = False
    for candidate in top_level_matches:
      if _is_object_or_file_resource(candidate):
        matched_this_url = True
        matched_any_url = True
        yield candidate
        continue

      # Only cloud buckets are expanded further; anything else is ignored.
      candidate_url = candidate.storage_url
      if not isinstance(candidate_url, storage_url.CloudUrl):
        continue
      if not candidate_url.is_bucket():
        continue

      # NOTE(review): unlike the top-level expansion above, this call does not
      # pass fetch_encrypted_object_hashes -- confirm that is intentional.
      bucket_contents = wildcard_iterator.get_wildcard_iterator(
          candidate_url.join('*').url_string,
          error_on_missing_key=False,
      )
      for bucket_entry in bucket_contents:
        if not isinstance(bucket_entry, resource_reference.ObjectResource):
          continue
        matched_this_url = True
        matched_any_url = True
        yield bucket_entry

    if not matched_this_url:
      log.warning('No matches found for {}'.format(url_string))

  if not matched_any_url:
    raise errors.InvalidUrlError('No URLS matched.')
@base.UniverseCompatible
class Hash(base.Command):
  """Calculates hashes on local or cloud files."""

  # Long-form description and usage examples for this command.
  detailed_help = {
      'DESCRIPTION':
          """
      Calculates hashes on local or cloud files that can be used to compare with
      "gcloud storage ls -L" output. If a specific hash option is not provided,
      this command calculates all gcloud storage-supported hashes for the file.
      Note that gcloud storage automatically performs hash validation when
      uploading or downloading files, so this command is only needed if you want
      to write a script that separately checks the hash for some reason.
      If you calculate a CRC32C hash for the file without a precompiled
      google-crc32c installation, hashing will be very slow.
      """,
      'EXAMPLES':
          """
      To get the MD5 and CRC32C hash digest of a cloud object in Base64 format:
        $ {command} gs://bucket/object
      To get just the MD5 hash digest of a local object in hex format:
        $ {command} /dir/object.txt --skip-crc32c --hex
      """,
  }

  @staticmethod
  def Args(parser):
    """Registers the positional URLs and the flags this command accepts.

    Args:
      parser: Argument parser the arguments and flags are added to.
    """
    parser.add_argument(
        'urls',
        nargs='+',
        help='Local or cloud URLs of objects to hash.',
    )
    parser.add_argument(
        '--hex',
        action='store_true',
        help=(
            'Output hash digests in hex format. By default, digests are'
            ' displayed in base64.'
        ),
    )

    # Both skip flags are registered in a single mutex group.
    skip_group = parser.add_group(mutex=True)
    skip_group.add_argument(
        '--skip-crc32c',
        action='store_true',
        help='Skip CRC32C hash calculation. Useful if command is running slow.',
    )
    skip_group.add_argument(
        '--skip-md5',
        action='store_true',
        help='Skip MD5 hash calculation. Useful if command is running slow.',
    )

    flags.add_encryption_flags(parser, command_only_reads_data=True)
    flags.add_additional_headers_flag(parser)

  def Run(self, args):
    """Yields a dict of formatted hash digests for each matched resource.

    Args:
      args: Parsed command-line arguments. Reads urls, hex, skip_crc32c and
        skip_md5, and is handed to the encryption key store initializer.

    Yields:
      dict: Holds the digest format ('hex' or 'base64'), the resource URL,
        and the CRC32C and/or MD5 digest unless the matching skip flag is
        set. Cloud objects that carry neither hash are reported with a
        warning and produce no dict.
    """
    encryption_util.initialize_key_store(args)

    include_crc32c = not args.skip_crc32c
    include_md5 = not args.skip_md5

    if include_crc32c:
      # Log which CRC32C backend will be used for this run.
      if fast_crc32c_util.should_use_gcloud_crc32c():
        crc32c_backend = 'gcloud-crc32c (Go binary)'
      elif crc32c.IS_FAST_GOOGLE_CRC32C_AVAILABLE:
        crc32c_backend = 'google-crc32c (Python binary)'
      else:
        crc32c_backend = 'crcmod (slow pure Python implementation)'
      log.info('CRC32C implementation: {}'.format(crc32c_backend))

    def _passthrough(digest):
      # Cloud digests are emitted unchanged when hex output is not requested.
      return digest

    def _hexdigest(hash_object):
      return hash_object.hexdigest()

    if args.hex:
      digest_format = 'hex'
      render_cloud_digest = _convert_base64_to_hex
      render_file_digest = _hexdigest
    else:
      digest_format = 'base64'
      render_cloud_digest = _passthrough
      render_file_digest = hash_util.get_base64_hash_digest_string

    for resource in _get_resource_iterator(args.urls):
      if isinstance(resource, resource_reference.ObjectResource):
        # Cloud object: reuse the hashes already stored on the resource.
        if resource.crc32c_hash is None and resource.md5_hash is None:
          log.warning('No hashes found for {}'.format(resource))
          continue
        record = {
            _DIGEST_FORMAT_KEY: digest_format,
            _URL_KEY: resource.storage_url.versionless_url_string,
        }
        if include_crc32c:
          record[_CRC32C_HASH_KEY] = render_cloud_digest(resource.crc32c_hash)
        if include_md5:
          record[_MD5_HASH_KEY] = render_cloud_digest(resource.md5_hash)
      else:
        # FileObjectResource: hashes are computed from the file on disk.
        local_path = resource.storage_url.resource_name
        record = {
            _DIGEST_FORMAT_KEY: digest_format,
            _URL_KEY: local_path,
        }
        if include_crc32c:
          crc32c_object = hash_util.get_hash_from_file(
              local_path, hash_util.HashAlgorithm.CRC32C)
          record[_CRC32C_HASH_KEY] = render_file_digest(crc32c_object)
        if include_md5:
          md5_object = hash_util.get_hash_from_file(
              local_path, hash_util.HashAlgorithm.MD5)
          record[_MD5_HASH_KEY] = render_file_digest(md5_object)
      yield record