from __future__ import annotations
import struct
from typing import Iterable
import numpy as np
from datasketch import MinHash
[docs]
class LeanMinHash(MinHash):
"""Lean MinHash is MinHash with a smaller memory footprint
and faster deserialization, but with its internal state frozen
-- no `update()`.
Lean MinHash inherits all methods from :class:`datasketch.MinHash`.
It does not store the `permutations` and the `hashfunc` needed for updating.
If a MinHash does not need further updates, convert it into a lean MinHash
to save memory.
Example:
To create a lean MinHash from an existing MinHash:
.. code-block:: python
lean_minhash = LeanMinHash(minhash)
# You can compute the Jaccard similarity between two lean MinHash
lean_minhash.jaccard(lean_minhash2)
# Or between a lean MinHash and a MinHash
lean_minhash.jaccard(minhash2)
To create a lean MinHash from the hash values and seed of an existing
MinHash:
.. code-block:: python
lean_minhash = LeanMinHash(seed=minhash.seed,
hashvalues=minhash.hashvalues)
To create a MinHash from a lean MinHash:
.. code-block:: python
minhash = MinHash(seed=lean_minhash.seed,
hashvalues=lean_minhash.hashvalues)
# Or if you want to prevent further updates on minhash
# from affecting the state of lean_minhash
minhash = MinHash(seed=lean_minhash.seed,
hashvalues=lean_minhash.digest())
Note:
Lean MinHash can also be used in :class:`datasketch.MinHashLSH`,
:class:`datasketch.MinHashLSHForest`, and :class:`datasketch.MinHashLSHEnsemble`.
Args:
minhash (optional): The :class:`datasketch.MinHash` object used to
initialize the LeanMinHash. If this is not set, then `seed`
and `hashvalues` must be set.
seed (optional): The random seed that controls the set of random
permutation functions generated for this LeanMinHash. This parameter
must be used together with `hashvalues`.
hashvalues (optional): The hash values used to inititialize the state
of the LeanMinHash. This parameter must be used together with
`seed`.
"""
__slots__ = ("seed", "hashvalues")
def _initialize_slots(self, seed, hashvalues):
"""Initialize the slots of the LeanMinHash.
Args:
seed (int): The random seed controls the set of random
permutation functions generated for this LeanMinHash.
hashvalues (Iterable): The hash values is the internal state of the LeanMinHash.
"""
self.seed = seed
self.hashvalues = self._parse_hashvalues(hashvalues)
[docs]
def __init__(
self, minhash: MinHash = None, seed: int = None, hashvalues: Iterable = None
):
if minhash is not None:
self._initialize_slots(minhash.seed, minhash.hashvalues)
elif hashvalues is not None and seed is not None:
self._initialize_slots(seed, hashvalues)
else:
raise ValueError(
"Init parameters cannot be None: make sure "
"to set either minhash or both of hash values and seed"
)
[docs]
def update(self, b) -> None:
"""This method is not available on a LeanMinHash.
Calling it raises a TypeError.
"""
raise TypeError("Cannot update a LeanMinHash")
[docs]
def copy(self) -> LeanMinHash:
lmh = object.__new__(LeanMinHash)
lmh._initialize_slots(*self.__slots__)
return lmh
[docs]
def bytesize(self, byteorder="@") -> int:
"""Compute the byte size after serialization.
Args:
byteorder (str, optional): This is byte order of the serialized data. Use one
of the `byte order characters
<https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment>`_:
``@``, ``=``, ``<``, ``>``, and ``!``.
Default is ``@`` -- the native order.
Returns:
int: Size in number of bytes after serialization.
"""
# Use 8 bytes to store the seed integer
seed_size = struct.calcsize(byteorder + "q")
# Use 4 bytes to store the number of hash values
length_size = struct.calcsize(byteorder + "i")
# Use 4 bytes to store each hash value as we are using the lower 32 bit
hashvalue_size = struct.calcsize(byteorder + "I")
return seed_size + length_size + len(self) * hashvalue_size
[docs]
def serialize(self, buf, byteorder="@") -> None:
"""
Serialize this lean MinHash and store the result in an allocated buffer.
Args:
buf (buffer): `buf` must implement the `buffer`_ interface.
One such example is the built-in `bytearray`_ class.
byteorder (str, optional): This is byte order of the serialized data. Use one
of the `byte order characters
<https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment>`_:
``@``, ``=``, ``<``, ``>``, and ``!``.
Default is ``@`` -- the native order.
This is preferred over using `pickle`_ if the serialized lean MinHash needs
to be used by another program in a different programming language.
The serialization schema:
1. The first 8 bytes is the seed integer
2. The next 4 bytes is the number of hash values
3. The rest is the serialized hash values, each uses 4 bytes
Example:
To serialize a single lean MinHash into a `bytearray`_ buffer.
.. code-block:: python
buf = bytearray(lean_minhash.bytesize())
lean_minhash.serialize(buf)
To serialize multiple lean MinHash into a `bytearray`_ buffer.
.. code-block:: python
# assuming lean_minhashs is a list of LeanMinHash with the same size
size = lean_minhashs[0].bytesize()
buf = bytearray(size*len(lean_minhashs))
for i, lean_minhash in enumerate(lean_minhashs):
lean_minhash.serialize(buf[i*size:])
.. _`buffer`: https://docs.python.org/3/c-api/buffer.html
.. _`bytearray`: https://docs.python.org/3.6/library/functions.html#bytearray
.. _`byteorder`: https://docs.python.org/3/library/struct.html
"""
if len(buf) < self.bytesize():
raise ValueError(
"The buffer does not have enough space\
for holding this MinHash."
)
fmt = "%sqi%dI" % (byteorder, len(self))
struct.pack_into(fmt, buf, 0, self.seed, len(self), *self.hashvalues)
[docs]
@classmethod
def deserialize(cls, buf, byteorder="@") -> LeanMinHash:
"""
Deserialize a lean MinHash from a buffer.
Args:
buf (buffer): `buf` must implement the `buffer`_ interface.
One such example is the built-in `bytearray`_ class.
byteorder (str. optional): This is byte order of the serialized data. Use one
of the `byte order characters
<https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment>`_:
``@``, ``=``, ``<``, ``>``, and ``!``.
Default is ``@`` -- the native order.
Return:
datasketch.LeanMinHash: The deserialized lean MinHash
Example:
To deserialize a lean MinHash from a buffer.
.. code-block:: python
lean_minhash = LeanMinHash.deserialize(buf)
"""
fmt_seed_size = "%sqi" % byteorder
fmt_hash = byteorder + "%dI"
try:
seed, num_perm = struct.unpack_from(fmt_seed_size, buf, 0)
except TypeError:
seed, num_perm = struct.unpack_from(fmt_seed_size, buffer(buf), 0)
offset = struct.calcsize(fmt_seed_size)
try:
hashvalues = struct.unpack_from(fmt_hash % num_perm, buf, offset)
except TypeError:
hashvalues = struct.unpack_from(fmt_hash % num_perm, buffer(buf), offset)
lmh = object.__new__(LeanMinHash)
lmh._initialize_slots(seed, hashvalues)
return lmh
[docs]
def __getstate__(self):
buf = bytearray(self.bytesize())
fmt = "qi%dI" % len(self)
struct.pack_into(fmt, buf, 0, self.seed, len(self), *self.hashvalues)
return buf
def __setstate__(self, buf):
try:
seed, num_perm = struct.unpack_from("qi", buf, 0)
except TypeError:
seed, num_perm = struct.unpack_from("qi", buffer(buf), 0)
offset = struct.calcsize("qi")
try:
hashvalues = struct.unpack_from("%dI" % num_perm, buf, offset)
except TypeError:
hashvalues = struct.unpack_from("%dI" % num_perm, buffer(buf), offset)
self._initialize_slots(seed, hashvalues)
[docs]
def __hash__(self) -> int:
return hash((self.seed, tuple(self.hashvalues)))
[docs]
@classmethod
def union(cls, *lmhs: LeanMinHash) -> LeanMinHash:
"""Create a new lean MinHash by unioning multiple lean MinHash."""
if len(lmhs) < 2:
raise ValueError("Cannot union less than 2 MinHash")
num_perm = len(lmhs[0])
seed = lmhs[0].seed
if any((seed != m.seed or num_perm != len(m)) for m in lmhs):
raise ValueError(
"The unioning MinHash must have the\
same seed, number of permutation functions."
)
hashvalues = np.minimum.reduce([m.hashvalues for m in lmhs])
lmh = object.__new__(LeanMinHash)
lmh._initialize_slots(seed, hashvalues)
return lmh