# Source code for datasketch.weighted_minhash

from __future__ import annotations

import collections.abc
import copy
from typing import Union, List

import numpy as np
import scipy as sp

class WeightedMinHash(object):
    """New weighted MinHash is generated by :class:`WeightedMinHashGenerator`.

    You can also initialize a weighted MinHash by using the state
    from an existing one.

    Args:
        seed (int): The random seed used to generate this weighted MinHash.
        hashvalues (numpy.ndarray): The internal state of this weighted
            MinHash.
    """

    # Defining ``__eq__`` already makes instances unhashable; state it
    # explicitly because the hash values are a mutable array.
    __hash__ = None

    def __init__(self, seed: int, hashvalues: np.ndarray) -> None:
        self.seed = seed
        self.hashvalues = hashvalues

    def jaccard(self, other: WeightedMinHash) -> float:
        """Estimate the weighted Jaccard similarity between the multi-sets
        represented by this weighted MinHash and the other.

        Args:
            other (WeightedMinHash): The other weighted MinHash.

        Returns:
            float: The weighted Jaccard similarity between 0.0 and 1.0.

        Raises:
            ValueError: If the two weighted MinHash objects have different
                seeds or different numbers of hash values.
        """
        if other.seed != self.seed:
            raise ValueError(
                "Cannot compute Jaccard given WeightedMinHash objects with "
                "different seeds"
            )
        if len(self) != len(other):
            raise ValueError(
                "Cannot compute Jaccard given WeightedMinHash objects with "
                "different numbers of hash values"
            )
        # Count the positions at which both sketches hold the same hash
        # value entry.
        intersection = sum(
            1
            for this, that in zip(self.hashvalues, other.hashvalues)
            if np.array_equal(this, that)
        )
        return float(intersection) / float(len(self))

    def digest(self) -> np.ndarray:
        """Export the hash values, which is the internal state of the
        weighted MinHash.

        Returns:
            numpy.ndarray: A copy of the hash values.
        """
        return copy.copy(self.hashvalues)

    def copy(self) -> WeightedMinHash:
        """
        Returns:
            WeightedMinHash: A copy of this weighted MinHash by exporting
            its state.
        """
        return WeightedMinHash(self.seed, self.digest())

    def __len__(self) -> int:
        """
        Returns:
            int: The number of hash values.
        """
        return len(self.hashvalues)

    def __eq__(self, other) -> bool:
        """
        Returns:
            bool: If their seeds and hash values are both equal then two
            are equivalent.
        """
        return (
            type(self) is type(other)
            and self.seed == other.seed
            and np.array_equal(self.hashvalues, other.hashvalues)
        )
class WeightedMinHashGenerator(object):
    """The weighted MinHash generator is used for creating
    new :class:`WeightedMinHash` objects.

    This weighted MinHash implementation is based on Sergey Ioffe's paper,
    "Improved Consistent Sampling, Weighted Minhash and L1 Sketching".

    Args:
        dim (int): The number of dimensions of the input Jaccard vectors.
        sample_size (int): The number of samples to use for creating
            weighted MinHash.
        seed (int): The random seed to use for generating permutation
            functions.
    """

    def __init__(self, dim: int, sample_size: int = 128, seed: int = 1) -> None:
        self.dim = dim
        self.sample_size = sample_size
        self.seed = seed
        generator = np.random.RandomState(seed=seed)
        # The draw order (rs, ln_cs, betas) determines the parameters
        # produced for a given seed, so it must not be rearranged.
        self.rs = generator.gamma(2, 1, (sample_size, dim)).astype(np.float32)
        self.ln_cs = np.log(generator.gamma(2, 1, (sample_size, dim))).astype(
            np.float32
        )
        self.betas = generator.uniform(0, 1, (sample_size, dim)).astype(np.float32)

    def minhash(self, v: np.ndarray) -> WeightedMinHash:
        """Create a new weighted MinHash given a weighted Jaccard vector.

        Each dimension is an integer frequency of the corresponding element
        in the multi-set represented by the vector. The input is never
        modified.

        Args:
            v (numpy.ndarray): The Jaccard vector.

        Returns:
            WeightedMinHash: The weighted MinHash.

        Raises:
            TypeError: If ``v`` is not an iterable.
            ValueError: If ``len(v)`` differs from ``dim`` or if every
                entry of ``v`` is zero.
        """
        if not isinstance(v, collections.abc.Iterable):
            raise TypeError("Input vector must be an iterable")
        if not len(v) == self.dim:
            raise ValueError("Input dimension mismatch, expecting %d" % self.dim)
        # Always work on a private float32 copy: zeros are overwritten with
        # NaN below and that must not leak into the caller's array.
        v = np.array(v, dtype=np.float32)
        vzeros = v == 0
        if vzeros.all():
            raise ValueError("Input is all zeros")
        # NaN marks absent dimensions so that nanargmin skips them.
        v[vzeros] = np.nan
        vlog = np.log(v)
        hashvalues = np.zeros((self.sample_size, 2), dtype=int)
        for i in range(self.sample_size):
            t = np.floor((vlog / self.rs[i]) + self.betas[i])
            ln_y = (t - self.betas[i]) * self.rs[i]
            ln_a = self.ln_cs[i] - ln_y - self.rs[i]
            k = np.nanargmin(ln_a)
            hashvalues[i][0], hashvalues[i][1] = k, int(t[k])
        return WeightedMinHash(self.seed, hashvalues)

    def minhash_many(
        self, X: Union[sp.sparse.spmatrix, np.ndarray]
    ) -> List[Union[WeightedMinHash, None]]:
        """Create new WeightedMinHash instances given a matrix of weighted
        Jaccard vectors.

        In the input matrix X, each row corresponds to a multi-set, and each
        column stores the integer frequency of the element of a dimension.

        Note:
            This method is experimental and does not yield the same MinHash
            hash values as :meth:`minhash`.

        Args:
            X (Union[scipy.sparse.spmatrix, numpy.ndarray]): A matrix of
                Jaccard vectors (rows).

        Returns:
            List[Union[WeightedMinHash, None]] - A list of length X.shape[0].
            Each element is either a :class:`WeightedMinHash` instance or
            None (if the original row in X is empty).

        Raises:
            TypeError: If ``X`` is neither a sparse matrix nor a numpy array.
            ValueError: If ``X`` is not two-dimensional or its number of
                columns differs from ``dim``.
        """
        # Input validation
        if not (sp.sparse.issparse(X) or isinstance(X, np.ndarray)):
            raise TypeError("Input X must be a sparse matrix or numpy matrix")
        if X.ndim != 2:
            raise ValueError("Input must have two dimensions")
        if X.shape[1] != self.dim:
            raise ValueError("Input dimension mismatch, expecting %d" % self.dim)

        # Work on a canonical private CSR copy: no duplicate entries, no
        # explicitly stored zeros, column indices sorted within each row.
        # After this, ``indices``/``data`` list exactly the nonzero entries
        # in row-major order and ``indptr`` delimits each row's slice.
        X = sp.sparse.csr_matrix(X, dtype=np.float32, copy=True)
        X.sum_duplicates()
        X.eliminate_zeros()
        X.sort_indices()
        num_docs = X.shape[0]
        cidx = X.indices
        rowptr = X.indptr

        # Per-nonzero parameters; each array is sample_size x nnz.
        rs_cidx = self.rs[:, cidx]
        betas_cidx = self.betas[:, cidx]
        ln_cs_cidx = self.ln_cs[:, cidx]
        log_data = np.log(X.data)  # nnz, broadcast across the samples

        # Unary transformations for all samples and all nonzeros at once.
        t = np.floor(log_data / rs_cidx + betas_cidx)
        ln_y = (t - betas_cidx + 1) * rs_cidx
        ln_a = ln_cs_cidx - ln_y

        sample_idx = np.arange(self.sample_size)
        ret = [None] * num_docs
        for it_doc in range(num_docs):
            doc_begin, doc_end = rowptr[it_doc], rowptr[it_doc + 1]
            if doc_begin == doc_end:
                # Empty row: leave None in the result.
                continue
            # For every sample, position (within the row) of the minimum.
            doc_argmin = np.argmin(ln_a[:, doc_begin:doc_end], axis=1)
            hashvalues = np.zeros((self.sample_size, 2), dtype=int)
            hashvalues[:, 0] = cidx[doc_begin:doc_end][doc_argmin]
            hashvalues[:, 1] = t[sample_idx, doc_begin + doc_argmin]
            ret[it_doc] = WeightedMinHash(self.seed, hashvalues)
        return ret