Source code for datasketch.lean_minhash

from __future__ import annotations
import struct
from typing import Iterable
import numpy as np

from datasketch import MinHash


[docs] class LeanMinHash(MinHash): """Lean MinHash is MinHash with a smaller memory footprint and faster deserialization, but with its internal state frozen -- no `update()`. Lean MinHash inherits all methods from :class:`datasketch.MinHash`. It does not store the `permutations` and the `hashfunc` needed for updating. If a MinHash does not need further updates, convert it into a lean MinHash to save memory. Example: To create a lean MinHash from an existing MinHash: .. code-block:: python lean_minhash = LeanMinHash(minhash) # You can compute the Jaccard similarity between two lean MinHash lean_minhash.jaccard(lean_minhash2) # Or between a lean MinHash and a MinHash lean_minhash.jaccard(minhash2) To create a lean MinHash from the hash values and seed of an existing MinHash: .. code-block:: python lean_minhash = LeanMinHash(seed=minhash.seed, hashvalues=minhash.hashvalues) To create a MinHash from a lean MinHash: .. code-block:: python minhash = MinHash(seed=lean_minhash.seed, hashvalues=lean_minhash.hashvalues) # Or if you want to prevent further updates on minhash # from affecting the state of lean_minhash minhash = MinHash(seed=lean_minhash.seed, hashvalues=lean_minhash.digest()) Note: Lean MinHash can also be used in :class:`datasketch.MinHashLSH`, :class:`datasketch.MinHashLSHForest`, and :class:`datasketch.MinHashLSHEnsemble`. Args: minhash (optional): The :class:`datasketch.MinHash` object used to initialize the LeanMinHash. If this is not set, then `seed` and `hashvalues` must be set. seed (optional): The random seed that controls the set of random permutation functions generated for this LeanMinHash. This parameter must be used together with `hashvalues`. hashvalues (optional): The hash values used to inititialize the state of the LeanMinHash. This parameter must be used together with `seed`. """ __slots__ = ("seed", "hashvalues") def _initialize_slots(self, seed, hashvalues): """Initialize the slots of the LeanMinHash. Args: seed (int): The random seed controls the set of random permutation functions generated for this LeanMinHash. hashvalues (Iterable): The hash values is the internal state of the LeanMinHash. """ self.seed = seed self.hashvalues = self._parse_hashvalues(hashvalues)
[docs] def __init__( self, minhash: MinHash = None, seed: int = None, hashvalues: Iterable = None ): if minhash is not None: self._initialize_slots(minhash.seed, minhash.hashvalues) elif hashvalues is not None and seed is not None: self._initialize_slots(seed, hashvalues) else: raise ValueError( "Init parameters cannot be None: make sure " "to set either minhash or both of hash values and seed" )
[docs] def update(self, b) -> None: """This method is not available on a LeanMinHash. Calling it raises a TypeError. """ raise TypeError("Cannot update a LeanMinHash")
[docs] def copy(self) -> LeanMinHash: lmh = object.__new__(LeanMinHash) lmh._initialize_slots(*self.__slots__) return lmh
[docs] def bytesize(self, byteorder="@") -> int: """Compute the byte size after serialization. Args: byteorder (str, optional): This is byte order of the serialized data. Use one of the `byte order characters <https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment>`_: ``@``, ``=``, ``<``, ``>``, and ``!``. Default is ``@`` -- the native order. Returns: int: Size in number of bytes after serialization. """ # Use 8 bytes to store the seed integer seed_size = struct.calcsize(byteorder + "q") # Use 4 bytes to store the number of hash values length_size = struct.calcsize(byteorder + "i") # Use 4 bytes to store each hash value as we are using the lower 32 bit hashvalue_size = struct.calcsize(byteorder + "I") return seed_size + length_size + len(self) * hashvalue_size
[docs] def serialize(self, buf, byteorder="@") -> None: """ Serialize this lean MinHash and store the result in an allocated buffer. Args: buf (buffer): `buf` must implement the `buffer`_ interface. One such example is the built-in `bytearray`_ class. byteorder (str, optional): This is byte order of the serialized data. Use one of the `byte order characters <https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment>`_: ``@``, ``=``, ``<``, ``>``, and ``!``. Default is ``@`` -- the native order. This is preferred over using `pickle`_ if the serialized lean MinHash needs to be used by another program in a different programming language. The serialization schema: 1. The first 8 bytes is the seed integer 2. The next 4 bytes is the number of hash values 3. The rest is the serialized hash values, each uses 4 bytes Example: To serialize a single lean MinHash into a `bytearray`_ buffer. .. code-block:: python buf = bytearray(lean_minhash.bytesize()) lean_minhash.serialize(buf) To serialize multiple lean MinHash into a `bytearray`_ buffer. .. code-block:: python # assuming lean_minhashs is a list of LeanMinHash with the same size size = lean_minhashs[0].bytesize() buf = bytearray(size*len(lean_minhashs)) for i, lean_minhash in enumerate(lean_minhashs): lean_minhash.serialize(buf[i*size:]) .. _`buffer`: https://docs.python.org/3/c-api/buffer.html .. _`bytearray`: https://docs.python.org/3.6/library/functions.html#bytearray .. _`byteorder`: https://docs.python.org/3/library/struct.html """ if len(buf) < self.bytesize(): raise ValueError( "The buffer does not have enough space\ for holding this MinHash." ) fmt = "%sqi%dI" % (byteorder, len(self)) struct.pack_into(fmt, buf, 0, self.seed, len(self), *self.hashvalues)
[docs] @classmethod def deserialize(cls, buf, byteorder="@") -> LeanMinHash: """ Deserialize a lean MinHash from a buffer. Args: buf (buffer): `buf` must implement the `buffer`_ interface. One such example is the built-in `bytearray`_ class. byteorder (str. optional): This is byte order of the serialized data. Use one of the `byte order characters <https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment>`_: ``@``, ``=``, ``<``, ``>``, and ``!``. Default is ``@`` -- the native order. Return: datasketch.LeanMinHash: The deserialized lean MinHash Example: To deserialize a lean MinHash from a buffer. .. code-block:: python lean_minhash = LeanMinHash.deserialize(buf) """ fmt_seed_size = "%sqi" % byteorder fmt_hash = byteorder + "%dI" try: seed, num_perm = struct.unpack_from(fmt_seed_size, buf, 0) except TypeError: seed, num_perm = struct.unpack_from(fmt_seed_size, buffer(buf), 0) offset = struct.calcsize(fmt_seed_size) try: hashvalues = struct.unpack_from(fmt_hash % num_perm, buf, offset) except TypeError: hashvalues = struct.unpack_from(fmt_hash % num_perm, buffer(buf), offset) lmh = object.__new__(LeanMinHash) lmh._initialize_slots(seed, hashvalues) return lmh
[docs] def __getstate__(self): buf = bytearray(self.bytesize()) fmt = "qi%dI" % len(self) struct.pack_into(fmt, buf, 0, self.seed, len(self), *self.hashvalues) return buf
def __setstate__(self, buf): try: seed, num_perm = struct.unpack_from("qi", buf, 0) except TypeError: seed, num_perm = struct.unpack_from("qi", buffer(buf), 0) offset = struct.calcsize("qi") try: hashvalues = struct.unpack_from("%dI" % num_perm, buf, offset) except TypeError: hashvalues = struct.unpack_from("%dI" % num_perm, buffer(buf), offset) self._initialize_slots(seed, hashvalues)
[docs] def __hash__(self) -> int: return hash((self.seed, tuple(self.hashvalues)))
[docs] @classmethod def union(cls, *lmhs: LeanMinHash) -> LeanMinHash: """Create a new lean MinHash by unioning multiple lean MinHash.""" if len(lmhs) < 2: raise ValueError("Cannot union less than 2 MinHash") num_perm = len(lmhs[0]) seed = lmhs[0].seed if any((seed != m.seed or num_perm != len(m)) for m in lmhs): raise ValueError( "The unioning MinHash must have the\ same seed, number of permutation functions." ) hashvalues = np.minimum.reduce([m.hashvalues for m in lmhs]) lmh = object.__new__(LeanMinHash) lmh._initialize_slots(seed, hashvalues) return lmh