File: //snap/google-cloud-cli/396/lib/surface/storage/hash.py
# -*- coding: utf-8 -*- #
# Copyright 2021 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementation of hash command for getting formatted file hashes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
import base64
import binascii
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.storage import encryption_util
from googlecloudsdk.command_lib.storage import errors
from googlecloudsdk.command_lib.storage import fast_crc32c_util
from googlecloudsdk.command_lib.storage import flags
from googlecloudsdk.command_lib.storage import hash_util
from googlecloudsdk.command_lib.storage import storage_url
from googlecloudsdk.command_lib.storage import wildcard_iterator
from googlecloudsdk.command_lib.storage.resources import resource_reference
from googlecloudsdk.command_lib.util import crc32c
from googlecloudsdk.core import log
# Keys of the per-resource dictionary yielded by Hash.Run.
# Records whether the digests in the entry are 'hex' or 'base64' strings.
_DIGEST_FORMAT_KEY = 'digest_format'
# Present unless --skip-crc32c was passed.
_CRC32C_HASH_KEY = 'crc32c_hash'
# Present unless --skip-md5 was passed.
_MD5_HASH_KEY = 'md5_hash'
# URL (cloud object) or resource name (local file) the hashes belong to.
_URL_KEY = 'url'
def _convert_base64_to_hex(base64_string):
"""Converts base64 hash digest to hex-formatted hash digest string."""
if base64_string is None:
return None
return binascii.hexlify(
base64.b64decode(
base64_string.strip('\n"\'').encode('utf-8'))).decode('utf-8')
def _is_object_or_file_resource(resource):
  """Returns True if resource is a cloud object or a local file resource."""
  hashable_resource_types = (
      resource_reference.ObjectResource,
      resource_reference.FileObjectResource,
  )
  return isinstance(resource, hashable_resource_types)
def _get_resource_iterator(url_strings):
  """Expands URLs into object/file resources, descending one level into buckets.

  Each URL string is wildcard-expanded. Matches that are cloud objects or local
  files are yielded directly. A match that is a cloud bucket is expanded once
  more with a trailing '*' and the resulting object resources are yielded.

  Args:
    url_strings (Iterable[str]): URL strings to expand.

  Yields:
    Object or file resources matched by the URL strings.

  Raises:
    errors.InvalidUrlError: None of the URL strings produced a match. A
      warning is logged for each individual URL string that had no match.
  """
  matched_anything = False

  for url_string in url_strings:
    top_level_matches = wildcard_iterator.get_wildcard_iterator(
        url_string,
        error_on_missing_key=False,
        fetch_encrypted_object_hashes=True,
    )
    matched_this_url = False

    for candidate in top_level_matches:
      if _is_object_or_file_resource(candidate):
        matched_this_url = True
        matched_anything = True
        yield candidate
        continue

      # Only cloud buckets are expanded further; anything else is ignored.
      candidate_url = candidate.storage_url
      if not isinstance(candidate_url, storage_url.CloudUrl):
        continue
      if not candidate_url.is_bucket():
        continue

      # NOTE(review): unlike the top-level expansion above, this call does not
      # pass fetch_encrypted_object_hashes -- confirm that is intentional.
      bucket_contents = wildcard_iterator.get_wildcard_iterator(
          candidate_url.join('*').url_string,
          error_on_missing_key=False,
      )
      for bucket_child in bucket_contents:
        if not isinstance(bucket_child, resource_reference.ObjectResource):
          continue
        matched_this_url = True
        matched_anything = True
        yield bucket_child

    if not matched_this_url:
      log.warning('No matches found for {}'.format(url_string))

  if not matched_anything:
    raise errors.InvalidUrlError('No URLS matched.')
@base.UniverseCompatible
class Hash(base.Command):
  """Calculates hashes on local or cloud files."""

  # Long-form help text for the command, keyed by help section name.
  detailed_help = {
      'DESCRIPTION':
          """
Calculates hashes on local or cloud files that can be used to compare with
"gcloud storage ls -L" output. If a specific hash option is not provided,
this command calculates all gcloud storage-supported hashes for the file.
Note that gcloud storage automatically performs hash validation when
uploading or downloading files, so this command is only needed if you want
to write a script that separately checks the hash for some reason.
If you calculate a CRC32C hash for the file without a precompiled
google-crc32c installation, hashing will be very slow.
""",
      'EXAMPLES':
          """
To get the MD5 and CRC32C hash digest of a cloud object in Base64 format:
$ {command} gs://bucket/object
To get just the MD5 hash digest of a local object in hex format:
$ {command} /dir/object.txt --skip-crc32c --hex
""",
  }

  @staticmethod
  def Args(parser):
    """Registers the positional URLs and the hash/encryption/header flags."""
    parser.add_argument(
        'urls',
        nargs='+',
        help='Local or cloud URLs of objects to hash.',
    )
    parser.add_argument(
        '--hex',
        action='store_true',
        help=(
            'Output hash digests in hex format.'
            ' By default, digests are displayed in base64.'
        ),
    )

    # --skip-crc32c and --skip-md5 are mutually exclusive.
    skip_group = parser.add_group(mutex=True)
    skip_group.add_argument(
        '--skip-crc32c',
        action='store_true',
        help='Skip CRC32C hash calculation. Useful if command is running slow.',
    )
    skip_group.add_argument(
        '--skip-md5',
        action='store_true',
        help='Skip MD5 hash calculation. Useful if command is running slow.',
    )

    flags.add_encryption_flags(parser, command_only_reads_data=True)
    flags.add_additional_headers_flag(parser)

  def Run(self, args):
    """Yields one dictionary of formatted hash digests per matched resource.

    Cloud objects report the hashes already stored on the resource; local
    files are hashed by reading them. Cloud objects with neither a CRC32C nor
    an MD5 hash are skipped with a warning.

    Args:
      args: Parsed command-line arguments.

    Yields:
      Dictionaries holding the digest format, the resource URL and the
      requested hash digests.
    """
    encryption_util.initialize_key_store(args)

    include_crc32c = not args.skip_crc32c
    include_md5 = not args.skip_md5

    if include_crc32c:
      # Report which CRC32C backend is in use, since speed differs greatly.
      if fast_crc32c_util.should_use_gcloud_crc32c():
        backend_description = 'gcloud-crc32c (Go binary)'
      elif crc32c.IS_FAST_GOOGLE_CRC32C_AVAILABLE:
        backend_description = 'google-crc32c (Python binary)'
      else:
        backend_description = 'crcmod (slow pure Python implementation)'
      log.info('CRC32C implementation: {}'.format(backend_description))

    def _hex_from_hash_object(hash_object):
      return hash_object.hexdigest()

    def _unchanged(digest):
      return digest

    # Cloud digests arrive as base64 strings; local hashes are hash objects.
    if args.hex:
      digest_format = 'hex'
      render_cloud_digest = _convert_base64_to_hex
      render_local_hash = _hex_from_hash_object
    else:
      digest_format = 'base64'
      render_cloud_digest = _unchanged
      render_local_hash = hash_util.get_base64_hash_digest_string

    for resource in _get_resource_iterator(args.urls):
      if isinstance(resource, resource_reference.ObjectResource):
        if resource.crc32c_hash is None and resource.md5_hash is None:
          log.warning('No hashes found for {}'.format(resource))
          continue

        entry = {
            _DIGEST_FORMAT_KEY: digest_format,
            _URL_KEY: resource.storage_url.versionless_url_string,
        }
        if include_crc32c:
          entry[_CRC32C_HASH_KEY] = render_cloud_digest(resource.crc32c_hash)
        if include_md5:
          entry[_MD5_HASH_KEY] = render_cloud_digest(resource.md5_hash)
      else:
        # Anything that is not a cloud object here is a local file resource.
        local_path = resource.storage_url.resource_name
        entry = {
            _DIGEST_FORMAT_KEY: digest_format,
            _URL_KEY: local_path,
        }
        # CRC32C is listed first so the output key order stays stable.
        requested_hashes = (
            (include_crc32c, _CRC32C_HASH_KEY, hash_util.HashAlgorithm.CRC32C),
            (include_md5, _MD5_HASH_KEY, hash_util.HashAlgorithm.MD5),
        )
        for is_requested, output_key, algorithm in requested_hashes:
          if not is_requested:
            continue
          entry[output_key] = render_local_hash(
              hash_util.get_hash_from_file(local_path, algorithm)
          )

      yield entry