Source code for datasketch.lean_minhash

import struct
import numpy as np

from datasketch import MinHash

[docs]class LeanMinHash(MinHash): '''Lean MinHash is MinHash with a smaller memory footprint and faster deserialization, but with its internal state frozen -- no `update()`. Lean MinHash inherits all methods from :class:`datasketch.MinHash`. It does not store the `permutations` and the `hashfunc` needed for updating. If a MinHash does not need further updates, convert it into a lean MinHash to save memory. Example: To create a lean MinHash from an existing MinHash: .. code-block:: python lean_minhash = LeanMinHash(minhash) # You can compute the Jaccard similarity between two lean MinHash lean_minhash.jaccard(lean_minhash2) # Or between a lean MinHash and a MinHash lean_minhash.jaccard(minhash2) To create a lean MinHash from the hash values and seed of an existing MinHash: .. code-block:: python lean_minhash = LeanMinHash(seed=minhash.seed, hashvalues=minhash.hashvalues) To create a MinHash from a lean MinHash: .. code-block:: python minhash = MinHash(seed=lean_minhash.seed, hashvalues=lean_minhash.hashvalues) # Or if you want to prevent further updates on minhash # from affecting the state of lean_minhash minhash = MinHash(seed=lean_minhash.seed, hashvalues=lean_minhash.digest()) Note: Lean MinHash can also be used in :class:`datasketch.MinHashLSH`, :class:`datasketch.MinHashLSHForest`, and :class:`datasketch.MinHashLSHEnsemble`. Args: minhash (optional): The :class:`datasketch.MinHash` object used to initialize the LeanMinHash. If this is not set, then `seed` and `hashvalues` must be set. seed (optional): The random seed that controls the set of random permutation functions generated for this LeanMinHash. This parameter must be used together with `hashvalues`. hashvalues (optional): The hash values used to inititialize the state of the LeanMinHash. This parameter must be used together with `seed`. ''' __slots__ = ('seed', 'hashvalues') def _initialize_slots(self, seed, hashvalues): '''Initialize the slots of the LeanMinHash. Args: seed (int): The random seed controls the set of random permutation functions generated for this LeanMinHash. hashvalues: The hash values is the internal state of the LeanMinHash. ''' self.seed = seed self.hashvalues = self._parse_hashvalues(hashvalues)
[docs] def __init__(self, minhash=None, seed=None, hashvalues=None): if minhash is not None: self._initialize_slots(minhash.seed, minhash.hashvalues) elif hashvalues is not None and seed is not None: self._initialize_slots(seed, hashvalues) else: raise ValueError("Init parameters cannot be None: make sure " "to set either minhash or both of hash values and seed")
[docs] def update(self, b): '''This method is not available on a LeanMinHash. Calling it raises a TypeError. ''' raise TypeError("Cannot update a LeanMinHash")
[docs] def copy(self): lmh = object.__new__(LeanMinHash) lmh._initialize_slots(*self.__slots__) return lmh
[docs] def bytesize(self, byteorder='@'): '''Compute the byte size after serialization. Args: byteorder (str, optional): This is byte order of the serialized data. Use one of the `byte order characters <https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment>`_: ``@``, ``=``, ``<``, ``>``, and ``!``. Default is ``@`` -- the native order. Returns: int: Size in number of bytes after serialization. ''' # Use 8 bytes to store the seed integer seed_size = struct.calcsize(byteorder+'q') # Use 4 bytes to store the number of hash values length_size = struct.calcsize(byteorder+'i') # Use 4 bytes to store each hash value as we are using the lower 32 bit hashvalue_size = struct.calcsize(byteorder+'I') return seed_size + length_size + len(self) * hashvalue_size
[docs] def serialize(self, buf, byteorder='@'): ''' Serialize this lean MinHash and store the result in an allocated buffer. Args: buf (buffer): `buf` must implement the `buffer`_ interface. One such example is the built-in `bytearray`_ class. byteorder (str, optional): This is byte order of the serialized data. Use one of the `byte order characters <https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment>`_: ``@``, ``=``, ``<``, ``>``, and ``!``. Default is ``@`` -- the native order. This is preferred over using `pickle`_ if the serialized lean MinHash needs to be used by another program in a different programming language. The serialization schema: 1. The first 8 bytes is the seed integer 2. The next 4 bytes is the number of hash values 3. The rest is the serialized hash values, each uses 4 bytes Example: To serialize a single lean MinHash into a `bytearray`_ buffer. .. code-block:: python buf = bytearray(lean_minhash.bytesize()) lean_minhash.serialize(buf) To serialize multiple lean MinHash into a `bytearray`_ buffer. .. code-block:: python # assuming lean_minhashs is a list of LeanMinHash with the same size size = lean_minhashs[0].bytesize() buf = bytearray(size*len(lean_minhashs)) for i, lean_minhash in enumerate(lean_minhashs): lean_minhash.serialize(buf[i*size:]) .. _`buffer`: https://docs.python.org/3/c-api/buffer.html .. _`bytearray`: https://docs.python.org/3.6/library/functions.html#bytearray .. _`byteorder`: https://docs.python.org/3/library/struct.html ''' if len(buf) < self.bytesize(): raise ValueError("The buffer does not have enough space\ for holding this MinHash.") fmt = "%sqi%dI" % (byteorder, len(self)) struct.pack_into(fmt, buf, 0, self.seed, len(self), *self.hashvalues)
[docs] @classmethod def deserialize(cls, buf, byteorder='@'): ''' Deserialize a lean MinHash from a buffer. Args: buf (buffer): `buf` must implement the `buffer`_ interface. One such example is the built-in `bytearray`_ class. byteorder (str. optional): This is byte order of the serialized data. Use one of the `byte order characters <https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment>`_: ``@``, ``=``, ``<``, ``>``, and ``!``. Default is ``@`` -- the native order. Return: datasketch.LeanMinHash: The deserialized lean MinHash Example: To deserialize a lean MinHash from a buffer. .. code-block:: python lean_minhash = LeanMinHash.deserialize(buf) ''' fmt_seed_size = "%sqi" % byteorder fmt_hash = byteorder + "%dI" try: seed, num_perm = struct.unpack_from(fmt_seed_size, buf, 0) except TypeError: seed, num_perm = struct.unpack_from(fmt_seed_size, buffer(buf), 0) offset = struct.calcsize(fmt_seed_size) try: hashvalues = struct.unpack_from(fmt_hash % num_perm, buf, offset) except TypeError: hashvalues = struct.unpack_from(fmt_hash % num_perm, buffer(buf), offset) lmh = object.__new__(LeanMinHash) lmh._initialize_slots(seed, hashvalues) return lmh
def __getstate__(self): buf = bytearray(self.bytesize()) fmt = "qi%dI" % len(self) struct.pack_into(fmt, buf, 0, self.seed, len(self), *self.hashvalues) return buf def __setstate__(self, buf): try: seed, num_perm = struct.unpack_from('qi', buf, 0) except TypeError: seed, num_perm = struct.unpack_from('qi', buffer(buf), 0) offset = struct.calcsize('qi') try: hashvalues = struct.unpack_from('%dI' % num_perm, buf, offset) except TypeError: hashvalues = struct.unpack_from('%dI' % num_perm, buffer(buf), offset) self._initialize_slots(seed, hashvalues)
[docs] def __hash__(self): return hash((self.seed, tuple(self.hashvalues)))
[docs] @classmethod def union(cls, *lmhs): if len(lmhs) < 2: raise ValueError("Cannot union less than 2 MinHash") num_perm = len(lmhs[0]) seed = lmhs[0].seed if any((seed != m.seed or num_perm != len(m)) for m in lmhs): raise ValueError("The unioning MinHash must have the\ same seed, number of permutation functions.") hashvalues = np.minimum.reduce([m.hashvalues for m in lmhs]) lmh = object.__new__(LeanMinHash) lmh._initialize_slots(seed, hashvalues) return lmh