"""
Utilities for hashing objects.
"""
from __future__ import absolute_import, division
import base64
from io import IOBase
import pickle
from pickle import PickleError, PicklingError
import hashlib
import struct
from collections.abc import Iterable
from pkg_resources import resource_filename
import numpy as np
from pisa.utils.log import logging, set_verbosity
from pisa.utils.resources import find_resource
__all__ = [
'FAST_HASH_FILESIZE_BYTES',
'FAST_HASH_NDARRAY_ELEMENTS',
'FAST_HASH_STR_CHARS',
'hash_obj',
'hash_file',
'test_hash_obj',
'test_hash_file',
]
__author__ = 'J.L. Lanfranchi'
__license__ = '''Copyright (c) 2014-2017, The IceCube Collaboration
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.'''
FAST_HASH_FILESIZE_BYTES = int(1e4)
"""For a fast hash on a file object, this many bytes of the file are used"""
FAST_HASH_NDARRAY_ELEMENTS = int(1e3)
"""For a fast hash on a numpy array or matrix, this many elements of the array
or matrix are used"""
FAST_HASH_STR_CHARS = int(1e3)
"""For a fast hash on a string (or object's pickle string representation), this
many characters are used"""
# NOTE: adding @line_profile decorator slows down function to order of 10s of
# ms even if set_verbosity(0)!
[docs]
def hash_obj(obj, hash_to='int', full_hash=True):
"""Return hash for an object. Object can be a numpy ndarray or matrix
(which is serialized to a string), an open file (which has its contents
read), or any pickle-able Python object.
Note that only the first most-significant 8 bytes (64 bits) from the MD5
sum are used in the hash.
Parameters
----------
obj : object
Object to hash. Note that the larger the object, the longer it takes to
hash.
hash_to : string
'i', 'int', or 'integer': First 8 bytes of the MD5 sum are interpreted
as an integer.
'b', 'bin', or 'binary': MD5 sum digest; returns an 8-character string
'h', 'x', 'hex': MD5 sum hexdigest, (string of 16 characters)
'b64', 'base64': first 8 bytes of MD5 sum are base64 encoded (with '+'
and '-' as final two characters of encoding). Returns string of 11
characters.
full_hash : bool
If True, hash on the full object's contents (which can be slow) or if
False, hash on a partial object. For example, only a file's first kB is
read, and only 1000 elements (chosen at random) of a numpy ndarray are
hashed on. This mode of operation should suffice for e.g. a
minimization run, but should _not_ be used for storing to/loading from
disk.
Returns
-------
hash_val : int or string
See also
--------
hash_file : hash a file on disk by filename/path
"""
if hash_to is None:
hash_to = 'int'
hash_to = hash_to.lower()
pass_on_kw = dict(hash_to=hash_to, full_hash=full_hash)
# TODO: convert an existing hash to the desired type, if it isn't already
# in this type
if hasattr(obj, 'hash') and obj.hash is not None and obj.hash == obj.hash:
return obj.hash
# Handle numpy arrays and matrices specially
if isinstance(obj, (np.ndarray, np.matrix)):
if full_hash:
return hash_obj(obj.tobytes(), **pass_on_kw)
len_flat = obj.size
stride = 1 + (len_flat // FAST_HASH_NDARRAY_ELEMENTS)
sub_elements = obj.flat[0::stride]
return hash_obj(sub_elements.tobytes(), **pass_on_kw)
# Handle an open file object as a special case
if isinstance(obj, IOBase):
if full_hash:
return hash_obj(obj.read(), **pass_on_kw)
return hash_obj(obj.read(FAST_HASH_FILESIZE_BYTES), **pass_on_kw)
# Convert to string (if not one already) in a fast and generic way: pickle;
# this creates a binary string, which is fine for sending to hashlib
if not isinstance(obj, str):
try:
pkl = pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)
except (PickleError, PicklingError, TypeError):
# Recurse into an iterable that couldn't be pickled
if isinstance(obj, Iterable):
return hash_obj([hash_obj(subobj) for subobj in obj],
**pass_on_kw)
else:
logging.error('Failed to pickle `obj` "%s" of type "%s"',
obj, type(obj))
raise
obj = pkl
if full_hash:
try:
md5hash = hashlib.md5(obj)
except TypeError:
md5hash = hashlib.md5(obj.encode())
else:
# Grab just a subset of the string by changing the stride taken in the
# character array (but if the string is less than
# FAST_HASH_FILESIZE_BYTES, use a stride length of 1)
stride = 1 + (len(obj) // FAST_HASH_STR_CHARS)
try:
md5hash = hashlib.md5(obj[0::stride])
except TypeError:
md5hash = hashlib.md5(obj[0::stride].encode())
if hash_to in ['i', 'int', 'integer']:
hash_val, = struct.unpack('<q', md5hash.digest()[:8])
elif hash_to in ['b', 'bin', 'binary']:
hash_val = md5hash.digest()[:8]
elif hash_to in ['h', 'x', 'hex', 'hexadecimal']:
hash_val = md5hash.hexdigest()[:16]
elif hash_to in ['b64', 'base64']:
hash_val = base64.b64encode(md5hash.digest()[:8], '+-')
else:
raise ValueError('Unrecognized `hash_to`: "%s"' % (hash_to,))
return hash_val
[docs]
def hash_file(fname, hash_to=None, full_hash=True):
"""Return a hash for a file, passing contents through hash_obj function."""
resource = find_resource(fname)
with open(resource, 'rb') as f:
return hash_obj(f, hash_to=hash_to, full_hash=full_hash)
[docs]
def test_hash_obj():
"""Unit tests for `hash_obj` function"""
assert hash_obj('x') == 3783177783470249117
assert hash_obj('x', full_hash=False) == 3783177783470249117
assert hash_obj('x', hash_to='hex') == '9dd4e461268c8034'
assert hash_obj(object()) != hash_obj(object)
for nel in [10, 100, 1000]:
rs = np.random.RandomState(seed=0)
a = rs.rand(nel, nel, 2)
a0_h_full = hash_obj(a)
a0_h_part = hash_obj(a, full_hash=False)
rs = np.random.RandomState(seed=1)
a = rs.rand(nel, nel, 2)
a1_h_full = hash_obj(a)
a1_h_part = hash_obj(a, full_hash=False)
rs = np.random.RandomState(seed=2)
a = rs.rand(nel, nel, 2)
a2_h_full = hash_obj(a)
a2_h_part = hash_obj(a, full_hash=False)
assert a1_h_full != a0_h_full
assert a2_h_full != a0_h_full
assert a2_h_full != a1_h_full
assert a1_h_part != a0_h_part
assert a2_h_part != a0_h_part
assert a2_h_part != a1_h_part
logging.info('<< PASS : test_hash_obj >>')
# TODO: test_hash_file function requires a "standard" file to test on
[docs]
def test_hash_file():
"""Unit tests for `hash_file` function"""
file_hash = hash_file(resource_filename('pisa.utils', 'hash.py'))
logging.debug(file_hash)
file_hash = hash_file(resource_filename('pisa.utils', 'hash.py'),
full_hash=False)
logging.debug(file_hash)
logging.info('<< PASS : test_hash_file >>')
if __name__ == "__main__":
set_verbosity(1)
test_hash_obj()
test_hash_file()